diff mbox series

[4/4] cxl: Make cxl_dpa_alloc() DPA partition number agnostic

Message ID 173709425022.753996.16667967718406367188.stgit@dwillia2-xfh.jf.intel.com
State New
Headers show
Series cxl: DPA partition metadata is a mess... | expand

Commit Message

Dan Williams Jan. 17, 2025, 6:10 a.m. UTC
cxl_dpa_alloc() is a hard coded nest of assumptions around PMEM
allocations being distinct from RAM allocations in specific ways when in
practice the allocation rules are only relative to DPA partition index.

The rules for cxl_dpa_alloc() are:

- allocations can only come from 1 partition

- if allocating at partition-index-N, all free space in partitions less
  than partition-index-N must be skipped over

Use the new 'struct cxl_dpa_partition' array to support allocation with
an arbitrary number of DPA partitions on the device.

A follow-on patch can go further to cleanup 'enum cxl_decoder_mode'
concept and supersede it with looking up the memory properties from
partition metadata.

Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Alejandro Lucero <alucerop@amd.com>
Cc: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/cxl/core/hdm.c |  167 +++++++++++++++++++++++++++++++++---------------
 drivers/cxl/cxlmem.h   |    9 +++
 2 files changed, 125 insertions(+), 51 deletions(-)

Comments

Jonathan Cameron Jan. 17, 2025, 11:12 a.m. UTC | #1
On Thu, 16 Jan 2025 22:10:50 -0800
Dan Williams <dan.j.williams@intel.com> wrote:

> cxl_dpa_alloc() is a hard coded nest of assumptions around PMEM
> allocations being distinct from RAM allocations in specific ways when in
> practice the allocation rules are only relative to DPA partition index.
> 
> The rules for cxl_dpa_alloc() are:
> 
> - allocations can only come from 1 partition
> 
> - if allocating at partition-index-N, all free space in partitions less
>   than partition-index-N must be skipped over
> 
> Use the new 'struct cxl_dpa_partition' array to support allocation with
> an arbitrary number of DPA partitions on the device.
> 
> A follow-on patch can go further to cleanup 'enum cxl_decoder_mode'
> concept and supersede it with looking up the memory properties from
> partition metadata.

If we'd move to metadata and these were tightly packed then I'd be fine
with nr_partitions. Until that step, I find it confusing.

A few comments inline. This series does bring some advantages though
at the cost of code that needs a bit more documentation at the very least.

> 
> Cc: Dave Jiang <dave.jiang@intel.com>
> Cc: Alejandro Lucero <alucerop@amd.com>
> Cc: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>  drivers/cxl/core/hdm.c |  167 +++++++++++++++++++++++++++++++++---------------
>  drivers/cxl/cxlmem.h   |    9 +++
>  2 files changed, 125 insertions(+), 51 deletions(-)
> 
> diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
> index 7e1559b3ed88..4a2816102a1e 100644
> --- a/drivers/cxl/core/hdm.c
> +++ b/drivers/cxl/core/hdm.c
> @@ -223,6 +223,30 @@ void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds)
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_dpa_debug, "CXL");
>  

Some documentation would be useful. I'm not sure I understand
this algorithm correctly.

I think this complexity is all about ensuring that skip regions have
their resources broken up on partition boundaries?

Can we potentially relax constraints a little more to make this
easier to read by not caring on the ordering?  Find overlap
of skip region with any partition and remove that bit unconditionally.

> +static void release_skip(struct cxl_dev_state *cxlds,
> +			 const resource_size_t skip_base,
> +			 const resource_size_t skip_len)
> +{
> +	resource_size_t skip_start = skip_base, skip_rem = skip_len;
> +
> +	for (int i = 0; i < cxlds->nr_partitions; i++) {
> +		const struct resource *part_res = &cxlds->part[i].res;
> +		resource_size_t skip_end, skip_size;
> +
> +		if (skip_start < part_res->start || skip_start > part_res->end)
> +			continue;
> +
> +		skip_end = min(part_res->end, skip_start + skip_rem - 1);
> +		skip_size = skip_end - skip_start + 1;
> +		__release_region(&cxlds->dpa_res, skip_start, skip_size);
> +		skip_start += skip_size;
> +		skip_rem -= skip_size;
> +
> +		if (!skip_rem)
> +			break;
> +	}
> +}
> +
>  /*
>   * Must be called in a context that synchronizes against this decoder's
>   * port ->remove() callback (like an endpoint decoder sysfs attribute)
> @@ -241,7 +265,7 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
>  	skip_start = res->start - cxled->skip;
>  	__release_region(&cxlds->dpa_res, res->start, resource_size(res));
>  	if (cxled->skip)
> -		__release_region(&cxlds->dpa_res, skip_start, cxled->skip);
> +		release_skip(cxlds, skip_start, cxled->skip);
>  	cxled->skip = 0;
>  	cxled->dpa_res = NULL;
>  	put_device(&cxled->cxld.dev);
> @@ -268,6 +292,47 @@ static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
>  	__cxl_dpa_release(cxled);
>  }
>  
> +static int request_skip(struct cxl_dev_state *cxlds,
> +			struct cxl_endpoint_decoder *cxled,
> +			const resource_size_t skip_base,
> +			const resource_size_t skip_len)
> +{
> +	resource_size_t skip_start = skip_base, skip_rem = skip_len;
> +
> +	for (int i = 0; i < cxlds->nr_partitions; i++) {

Likewise, if we relax a constraint on ordering can we make this simpler?
Would just need to keep track of whether we had reserved enough. I'm not
100% sure that is sufficient for the final error check.

> +		const struct resource *part_res = &cxlds->part[i].res;
> +		struct cxl_port *port = cxled_to_port(cxled);
> +		resource_size_t skip_end, skip_size;
> +		struct resource *res;
> +
> +		if (skip_start < part_res->start || skip_start > part_res->end)
> +			continue;
> +
> +		skip_end = min(part_res->end, skip_start + skip_rem - 1);
> +		skip_size = skip_end - skip_start + 1;
> +
> +		res = __request_region(&cxlds->dpa_res, skip_start, skip_size,
> +				       dev_name(&cxled->cxld.dev), 0);
> +		if (!res) {
> +			dev_dbg(cxlds->dev,
> +				"decoder%d.%d: failed to reserve skipped space\n",
> +				port->id, cxled->cxld.id);
> +			break;
> +		}
> +		skip_start += skip_size;
> +		skip_rem -= skip_size;
> +		if (!skip_rem)
> +			break;
> +	}
> +
> +	if (skip_rem == 0)
> +		return 0;
> +
> +	release_skip(cxlds, skip_base, skip_len - skip_rem);
Ah, this complicates the possibility of relaxations as we'd need to pass in what
partition number we'd reached when the failure occurred.
Maybe this is the best algorithm, but I'd definitely like docs for this
function to make it clear what its assumptions are (partitions in order of DPA etc.)
> +
> +	return -EBUSY;
> +}
> +


> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
> index 2e728d4b7327..43acd48b300f 100644
> --- a/drivers/cxl/cxlmem.h
> +++ b/drivers/cxl/cxlmem.h
> @@ -515,6 +515,15 @@ static inline resource_size_t cxl_pmem_size(struct cxl_dev_state *cxlds)
>  	return resource_size(res);
>  }
>  
> +static inline bool cxl_partition_contains(struct cxl_dev_state *cxlds,
> +					  unsigned int part,
> +					  struct resource *res)
> +{
> +	if (part >= cxlds->nr_partitions)
> +		return false;
> +	return resource_contains(&cxlds->part[part].res, res);
As per previous review: a zero-size resource never contains, so we can drop the check
on nr_partitions and instead check against a MAX that the core initializes to empty
(and might be overwritten by the drivers).
> +}
> +
>  static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
>  {
>  	return dev_get_drvdata(cxl_mbox->host);
> 
>
Alejandro Lucero Palau Jan. 17, 2025, 3:42 p.m. UTC | #2
On 1/17/25 06:10, Dan Williams wrote:
> cxl_dpa_alloc() is a hard coded nest of assumptions around PMEM
> allocations being distinct from RAM allocations in specific ways when in
> practice the allocation rules are only relative to DPA partition index.
>
> The rules for cxl_dpa_alloc() are:
>
> - allocations can only come from 1 partition
>
> - if allocating at partition-index-N, all free space in partitions less
>    than partition-index-N must be skipped over


In my view, you are mixing the current code with the new code in this 
explanation. It would be better to say the current code assumption is 
just two partitions, ram and pmem, but DCD changes the game.


> Use the new 'struct cxl_dpa_partition' array to support allocation with
> an arbitrary number of DPA partitions on the device.
>
> A follow-on patch can go further to cleanup 'enum cxl_decoder_mode'
> concept and supersede it with looking up the memory properties from
> partition metadata.
>
> Cc: Dave Jiang <dave.jiang@intel.com>
> Cc: Alejandro Lucero <alucerop@amd.com>
> Cc: Ira Weiny <ira.weiny@intel.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> ---
>   drivers/cxl/core/hdm.c |  167 +++++++++++++++++++++++++++++++++---------------
>   drivers/cxl/cxlmem.h   |    9 +++
>   2 files changed, 125 insertions(+), 51 deletions(-)
>
> diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
> index 7e1559b3ed88..4a2816102a1e 100644
> --- a/drivers/cxl/core/hdm.c
> +++ b/drivers/cxl/core/hdm.c
> @@ -223,6 +223,30 @@ void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds)
>   }
>   EXPORT_SYMBOL_NS_GPL(cxl_dpa_debug, "CXL");
>   
> +static void release_skip(struct cxl_dev_state *cxlds,
> +			 const resource_size_t skip_base,
> +			 const resource_size_t skip_len)
> +{
> +	resource_size_t skip_start = skip_base, skip_rem = skip_len;
> +
> +	for (int i = 0; i < cxlds->nr_partitions; i++) {
> +		const struct resource *part_res = &cxlds->part[i].res;
> +		resource_size_t skip_end, skip_size;
> +
> +		if (skip_start < part_res->start || skip_start > part_res->end)
> +			continue;
> +
> +		skip_end = min(part_res->end, skip_start + skip_rem - 1);
> +		skip_size = skip_end - skip_start + 1;
> +		__release_region(&cxlds->dpa_res, skip_start, skip_size);
> +		skip_start += skip_size;
> +		skip_rem -= skip_size;
> +
> +		if (!skip_rem)
> +			break;
> +	}
> +}
> +


This implies the skip cannot be based on the last child's end as the code
implements.


>   /*
>    * Must be called in a context that synchronizes against this decoder's
>    * port ->remove() callback (like an endpoint decoder sysfs attribute)
> @@ -241,7 +265,7 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
>   	skip_start = res->start - cxled->skip;
>   	__release_region(&cxlds->dpa_res, res->start, resource_size(res));
>   	if (cxled->skip)
> -		__release_region(&cxlds->dpa_res, skip_start, cxled->skip);
> +		release_skip(cxlds, skip_start, cxled->skip);
>   	cxled->skip = 0;
>   	cxled->dpa_res = NULL;
>   	put_device(&cxled->cxld.dev);
> @@ -268,6 +292,47 @@ static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
>   	__cxl_dpa_release(cxled);
>   }
>   
> +static int request_skip(struct cxl_dev_state *cxlds,
> +			struct cxl_endpoint_decoder *cxled,
> +			const resource_size_t skip_base,
> +			const resource_size_t skip_len)
> +{
> +	resource_size_t skip_start = skip_base, skip_rem = skip_len;
> +
> +	for (int i = 0; i < cxlds->nr_partitions; i++) {
> +		const struct resource *part_res = &cxlds->part[i].res;
> +		struct cxl_port *port = cxled_to_port(cxled);
> +		resource_size_t skip_end, skip_size;
> +		struct resource *res;
> +
> +		if (skip_start < part_res->start || skip_start > part_res->end)
> +			continue;
> +
> +		skip_end = min(part_res->end, skip_start + skip_rem - 1);
> +		skip_size = skip_end - skip_start + 1;
> +
> +		res = __request_region(&cxlds->dpa_res, skip_start, skip_size,
> +				       dev_name(&cxled->cxld.dev), 0);
> +		if (!res) {
> +			dev_dbg(cxlds->dev,
> +				"decoder%d.%d: failed to reserve skipped space\n",
> +				port->id, cxled->cxld.id);
> +			break;
> +		}
> +		skip_start += skip_size;
> +		skip_rem -= skip_size;
> +		if (!skip_rem)
> +			break;
> +	}
> +
> +	if (skip_rem == 0)
> +		return 0;
> +
> +	release_skip(cxlds, skip_base, skip_len - skip_rem);
> +
> +	return -EBUSY;
> +}
> +
>   static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>   			     resource_size_t base, resource_size_t len,
>   			     resource_size_t skipped)
> @@ -277,6 +342,7 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>   	struct cxl_dev_state *cxlds = cxlmd->cxlds;
>   	struct device *dev = &port->dev;
>   	struct resource *res;
> +	int rc;
>   
>   	lockdep_assert_held_write(&cxl_dpa_rwsem);
>   
> @@ -305,14 +371,9 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>   	}
>   
>   	if (skipped) {
> -		res = __request_region(&cxlds->dpa_res, base - skipped, skipped,
> -				       dev_name(&cxled->cxld.dev), 0);
> -		if (!res) {
> -			dev_dbg(dev,
> -				"decoder%d.%d: failed to reserve skipped space\n",
> -				port->id, cxled->cxld.id);
> -			return -EBUSY;
> -		}
> +		rc = request_skip(cxlds, cxled, base - skipped, skipped);
> +		if (rc)
> +			return rc;
>   	}
>   	res = __request_region(&cxlds->dpa_res, base, len,
>   			       dev_name(&cxled->cxld.dev), 0);
> @@ -320,16 +381,15 @@ static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
>   		dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n",
>   			port->id, cxled->cxld.id);
>   		if (skipped)
> -			__release_region(&cxlds->dpa_res, base - skipped,
> -					 skipped);
> +			release_skip(cxlds, base - skipped, skipped);
>   		return -EBUSY;
>   	}
>   	cxled->dpa_res = res;
>   	cxled->skip = skipped;
>   
> -	if (resource_contains(to_pmem_res(cxlds), res))
> +	if (cxl_partition_contains(cxlds, CXL_PARTITION_PMEM, res))
>   		cxled->mode = CXL_DECODER_PMEM;
> -	else if (resource_contains(to_ram_res(cxlds), res))
> +	else if (cxl_partition_contains(cxlds, CXL_PARTITION_RAM, res))
>   		cxled->mode = CXL_DECODER_RAM;
>   	else {
>   		dev_warn(dev, "decoder%d.%d: %pr does not map any partition\n",
> @@ -527,15 +587,13 @@ int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
>   int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
>   {
>   	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
> -	resource_size_t free_ram_start, free_pmem_start;
>   	struct cxl_port *port = cxled_to_port(cxled);
>   	struct cxl_dev_state *cxlds = cxlmd->cxlds;
>   	struct device *dev = &cxled->cxld.dev;
> -	resource_size_t start, avail, skip;
> +	struct resource *res, *prev = NULL;
> +	resource_size_t start, avail, skip, skip_start;
>   	struct resource *p, *last;
> -	const struct resource *ram_res = to_ram_res(cxlds);
> -	const struct resource *pmem_res = to_pmem_res(cxlds);
> -	int rc;
> +	int part, rc;
>   
>   	down_write(&cxl_dpa_rwsem);
>   	if (cxled->cxld.region) {
> @@ -551,47 +609,54 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
>   		goto out;
>   	}
>   
> -	for (p = ram_res->child, last = NULL; p; p = p->sibling)
> -		last = p;
> -	if (last)
> -		free_ram_start = last->end + 1;
> +	if (cxled->mode == CXL_DECODER_RAM)
> +		part = CXL_PARTITION_RAM;
> +	else if (cxled->mode == CXL_DECODER_PMEM)
> +		part = CXL_PARTITION_PMEM;
>   	else
> -		free_ram_start = ram_res->start;
> +		part = cxlds->nr_partitions;
> +
> +	if (part >= cxlds->nr_partitions) {
> +		dev_dbg(dev, "partition %d not found\n", part);
> +		rc = -EBUSY;
> +		goto out;
> +	}
> +
> +	res = &cxlds->part[part].res;
>   
> -	for (p = pmem_res->child, last = NULL; p; p = p->sibling)
> +	for (p = res->child, last = NULL; p; p = p->sibling)
>   		last = p;
>   	if (last)
> -		free_pmem_start = last->end + 1;
> +		start = last->end + 1;
>   	else
> -		free_pmem_start = pmem_res->start;
> +		start = res->start;
>   


As said above, this is not correct if there are holes due to releases.


> -	if (cxled->mode == CXL_DECODER_RAM) {
> -		start = free_ram_start;
> -		avail = ram_res->end - start + 1;
> -		skip = 0;
> -	} else if (cxled->mode == CXL_DECODER_PMEM) {
> -		resource_size_t skip_start, skip_end;
> -
> -		start = free_pmem_start;
> -		avail = pmem_res->end - start + 1;
> -		skip_start = free_ram_start;
> -
> -		/*
> -		 * If some pmem is already allocated, then that allocation
> -		 * already handled the skip.
> -		 */
> -		if (pmem_res->child &&
> -		    skip_start == pmem_res->child->start)
> -			skip_end = skip_start - 1;
> -		else
> -			skip_end = start - 1;
> -		skip = skip_end - skip_start + 1;
> -	} else {
> -		dev_dbg(dev, "mode not set\n");
> -		rc = -EINVAL;
> -		goto out;
> +	/*
> +	 * To allocate at partition N, a skip needs to be calculated for all
> +	 * unallocated space at lower partitions indices.
> +	 *
> +	 * If a partition has any allocations, the search can end because a
> +	 * previous cxl_dpa_alloc() invocation is assumed to have accounted for
> +	 * all previous partitions.
> +	 */


This is right, but the code below is not because ...


> +	skip_start = CXL_RESOURCE_NONE;
> +	for (int i = part; i; i--) {
> +		prev = &cxlds->part[i - 1].res;
> +		for (p = prev->child, last = NULL; p; p = p->sibling)
> +			last = p;


... holes ...


I think the problem here is we assumed ram and pmem each being a child and
likely some free space, but a device with multiple HDM decoders implies
potentially several children.

The code supported the case of multiple children but I guess we still had
in mind the simple case. Otherwise I cannot understand all this ...
Dan Williams Jan. 17, 2025, 6:37 p.m. UTC | #3
Jonathan Cameron wrote:
> On Thu, 16 Jan 2025 22:10:50 -0800
> Dan Williams <dan.j.williams@intel.com> wrote:
> 
> > cxl_dpa_alloc() is a hard coded nest of assumptions around PMEM
> > allocations being distinct from RAM allocations in specific ways when in
> > practice the allocation rules are only relative to DPA partition index.
> > 
> > The rules for cxl_dpa_alloc() are:
> > 
> > - allocations can only come from 1 partition
> > 
> > - if allocating at partition-index-N, all free space in partitions less
> >   than partition-index-N must be skipped over
> > 
> > Use the new 'struct cxl_dpa_partition' array to support allocation with
> > an arbitrary number of DPA partitions on the device.
> > 
> > A follow-on patch can go further to cleanup 'enum cxl_decoder_mode'
> > concept and supersede it with looking up the memory properties from
> > partition metadata.
> 
> If we'd move to meta data and these were tightly packed then I'd be fine
> with nr_partitions. Until that step, I find it confusing.
> 
> A few comments inline. This series does bring some advantages though
> at cost of code that needs a bit more documentation at the very least.
> 
> > 
> > Cc: Dave Jiang <dave.jiang@intel.com>
> > Cc: Alejandro Lucero <alucerop@amd.com>
> > Cc: Ira Weiny <ira.weiny@intel.com>
> > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> > ---
> >  drivers/cxl/core/hdm.c |  167 +++++++++++++++++++++++++++++++++---------------
> >  drivers/cxl/cxlmem.h   |    9 +++
> >  2 files changed, 125 insertions(+), 51 deletions(-)
> > 
> > diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
> > index 7e1559b3ed88..4a2816102a1e 100644
> > --- a/drivers/cxl/core/hdm.c
> > +++ b/drivers/cxl/core/hdm.c
> > @@ -223,6 +223,30 @@ void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds)
> >  }
> >  EXPORT_SYMBOL_NS_GPL(cxl_dpa_debug, "CXL");
> >  
> 
> Some documentation would be useful. I'm not sure I understand
> this algorithm correctly.
> 
> I think this complexity is all about ensuring that skip regions have
> their resources broken up on partition boundaries?
> 
> Can we potentially relax constraints a little more to make this
> easier to read by not caring on the ordering?  Find overlap
> of skip region with any partition and remove that bit unconditionally.
> 
> > +static void release_skip(struct cxl_dev_state *cxlds,
> > +			 const resource_size_t skip_base,
> > +			 const resource_size_t skip_len)
> > +{
> > +	resource_size_t skip_start = skip_base, skip_rem = skip_len;
> > +
> > +	for (int i = 0; i < cxlds->nr_partitions; i++) {
> > +		const struct resource *part_res = &cxlds->part[i].res;
> > +		resource_size_t skip_end, skip_size;
> > +
> > +		if (skip_start < part_res->start || skip_start > part_res->end)
> > +			continue;
> > +
> > +		skip_end = min(part_res->end, skip_start + skip_rem - 1);
> > +		skip_size = skip_end - skip_start + 1;
> > +		__release_region(&cxlds->dpa_res, skip_start, skip_size);
> > +		skip_start += skip_size;
> > +		skip_rem -= skip_size;
> > +
> > +		if (!skip_rem)
> > +			break;
> > +	}
> > +}
> > +
> >  /*
> >   * Must be called in a context that synchronizes against this decoder's
> >   * port ->remove() callback (like an endpoint decoder sysfs attribute)
> > @@ -241,7 +265,7 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
> >  	skip_start = res->start - cxled->skip;
> >  	__release_region(&cxlds->dpa_res, res->start, resource_size(res));
> >  	if (cxled->skip)
> > -		__release_region(&cxlds->dpa_res, skip_start, cxled->skip);
> > +		release_skip(cxlds, skip_start, cxled->skip);
> >  	cxled->skip = 0;
> >  	cxled->dpa_res = NULL;
> >  	put_device(&cxled->cxld.dev);
> > @@ -268,6 +292,47 @@ static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
> >  	__cxl_dpa_release(cxled);
> >  }
> >  
> > +static int request_skip(struct cxl_dev_state *cxlds,
> > +			struct cxl_endpoint_decoder *cxled,
> > +			const resource_size_t skip_base,
> > +			const resource_size_t skip_len)
> > +{
> > +	resource_size_t skip_start = skip_base, skip_rem = skip_len;
> > +
> > +	for (int i = 0; i < cxlds->nr_partitions; i++) {
> 
> Likewise, if we relax a constraint on ordering can we make this simpler?
> Would just need to keep track on whether we had reserved enough. I'm not
> 100% sure that is sufficient for the final error check.
> 
> > +		const struct resource *part_res = &cxlds->part[i].res;
> > +		struct cxl_port *port = cxled_to_port(cxled);
> > +		resource_size_t skip_end, skip_size;
> > +		struct resource *res;
> > +
> > +		if (skip_start < part_res->start || skip_start > part_res->end)
> > +			continue;
> > +
> > +		skip_end = min(part_res->end, skip_start + skip_rem - 1);
> > +		skip_size = skip_end - skip_start + 1;
> > +
> > +		res = __request_region(&cxlds->dpa_res, skip_start, skip_size,
> > +				       dev_name(&cxled->cxld.dev), 0);
> > +		if (!res) {
> > +			dev_dbg(cxlds->dev,
> > +				"decoder%d.%d: failed to reserve skipped space\n",
> > +				port->id, cxled->cxld.id);
> > +			break;
> > +		}
> > +		skip_start += skip_size;
> > +		skip_rem -= skip_size;
> > +		if (!skip_rem)
> > +			break;
> > +	}
> > +
> > +	if (skip_rem == 0)
> > +		return 0;
> > +
> > +	release_skip(cxlds, skip_base, skip_len - skip_rem);
> Ah, this complicates possibility of relaxations as we'd need to pass in what
> partion number we'd reached when fail occurred.
> Maybe this is the best algorithm, but I'd definitely like docs for this
> function to make it clear what it's assumptions are (paritions in order of DPA etc)

Yes, the software requirements and assumptions for tracking "DPABase"
deserve to be called out in comments for these helpers. Those "DPABase"
tracking assumptions are spelled out in the implementation note in
"8.2.4.19.13 Decoder Protection".
Dan Williams Jan. 17, 2025, 8:57 p.m. UTC | #4
Alejandro Lucero Palau wrote:
> 
> On 1/17/25 06:10, Dan Williams wrote:
> > cxl_dpa_alloc() is a hard coded nest of assumptions around PMEM
> > allocations being distinct from RAM allocations in specific ways when in
> > practice the allocation rules are only relative to DPA partition index.
> >
> > The rules for cxl_dpa_alloc() are:
> >
> > - allocations can only come from 1 partition
> >
> > - if allocating at partition-index-N, all free space in partitions less
> >    than partition-index-N must be skipped over
> 
> 
> In my view, you are mixing the current code with the new code in this 
> explanation. It would be better to say the current code assumption is 
> just two partitions, ram and pmem, but DCD changes the game.

There is no mixture in that description. The rules have not changed from
old to new, the implementation is updated to reflect that the algorithm
never needed to consider ram and pmem explicitly.

> > Use the new 'struct cxl_dpa_partition' array to support allocation with
> > an arbitrary number of DPA partitions on the device.
> >
> > A follow-on patch can go further to cleanup 'enum cxl_decoder_mode'
> > concept and supersede it with looking up the memory properties from
> > partition metadata.
> >
> > Cc: Dave Jiang <dave.jiang@intel.com>
> > Cc: Alejandro Lucero <alucerop@amd.com>
> > Cc: Ira Weiny <ira.weiny@intel.com>
> > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> > ---
> >   drivers/cxl/core/hdm.c |  167 +++++++++++++++++++++++++++++++++---------------
> >   drivers/cxl/cxlmem.h   |    9 +++
> >   2 files changed, 125 insertions(+), 51 deletions(-)
> >
> > diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
> > index 7e1559b3ed88..4a2816102a1e 100644
> > --- a/drivers/cxl/core/hdm.c
> > +++ b/drivers/cxl/core/hdm.c
> > @@ -223,6 +223,30 @@ void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds)
> >   }
> >   EXPORT_SYMBOL_NS_GPL(cxl_dpa_debug, "CXL");
> >   
> > +static void release_skip(struct cxl_dev_state *cxlds,
> > +			 const resource_size_t skip_base,
> > +			 const resource_size_t skip_len)
> > +{
> > +	resource_size_t skip_start = skip_base, skip_rem = skip_len;
> > +
> > +	for (int i = 0; i < cxlds->nr_partitions; i++) {
> > +		const struct resource *part_res = &cxlds->part[i].res;
> > +		resource_size_t skip_end, skip_size;
> > +
> > +		if (skip_start < part_res->start || skip_start > part_res->end)
> > +			continue;
> > +
> > +		skip_end = min(part_res->end, skip_start + skip_rem - 1);
> > +		skip_size = skip_end - skip_start + 1;
> > +		__release_region(&cxlds->dpa_res, skip_start, skip_size);
> > +		skip_start += skip_size;
> > +		skip_rem -= skip_size;
> > +
> > +		if (!skip_rem)
> > +			break;
> > +	}
> > +}
> > +
> 
> 
> This implies the skip can not be based on the last child end as the code 
> implements.

I do not follow this comment... more below.

[..]
> > @@ -551,47 +609,54 @@ int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
> >   		goto out;
> >   	}
> >   
> > -	for (p = ram_res->child, last = NULL; p; p = p->sibling)
> > -		last = p;
> > -	if (last)
> > -		free_ram_start = last->end + 1;
> > +	if (cxled->mode == CXL_DECODER_RAM)
> > +		part = CXL_PARTITION_RAM;
> > +	else if (cxled->mode == CXL_DECODER_PMEM)
> > +		part = CXL_PARTITION_PMEM;
> >   	else
> > -		free_ram_start = ram_res->start;
> > +		part = cxlds->nr_partitions;
> > +
> > +	if (part >= cxlds->nr_partitions) {
> > +		dev_dbg(dev, "partition %d not found\n", part);
> > +		rc = -EBUSY;
> > +		goto out;
> > +	}
> > +
> > +	res = &cxlds->part[part].res;
> >   
> > -	for (p = pmem_res->child, last = NULL; p; p = p->sibling)
> > +	for (p = res->child, last = NULL; p; p = p->sibling)
> >   		last = p;
> >   	if (last)
> > -		free_pmem_start = last->end + 1;
> > +		start = last->end + 1;
> >   	else
> > -		free_pmem_start = pmem_res->start;
> > +		start = res->start;
> >   
> 
> 
> As said above, this is not correct if there are holes due to releases.

There are no holes introduced by releases. The skip is always one
contiguous range, it just happens to need to be tracked via multiple
entries in the ->dpa_res tree. Jonathan had asked for more commentary on
what is happening request_skip() and release_skip(), I will clarify this
detail about contiguity.

> 
> 
> > -	if (cxled->mode == CXL_DECODER_RAM) {
> > -		start = free_ram_start;
> > -		avail = ram_res->end - start + 1;
> > -		skip = 0;
> > -	} else if (cxled->mode == CXL_DECODER_PMEM) {
> > -		resource_size_t skip_start, skip_end;
> > -
> > -		start = free_pmem_start;
> > -		avail = pmem_res->end - start + 1;
> > -		skip_start = free_ram_start;
> > -
> > -		/*
> > -		 * If some pmem is already allocated, then that allocation
> > -		 * already handled the skip.
> > -		 */
> > -		if (pmem_res->child &&
> > -		    skip_start == pmem_res->child->start)
> > -			skip_end = skip_start - 1;
> > -		else
> > -			skip_end = start - 1;
> > -		skip = skip_end - skip_start + 1;
> > -	} else {
> > -		dev_dbg(dev, "mode not set\n");
> > -		rc = -EINVAL;
> > -		goto out;
> > +	/*
> > +	 * To allocate at partition N, a skip needs to be calculated for all
> > +	 * unallocated space at lower partitions indices.
> > +	 *
> > +	 * If a partition has any allocations, the search can end because a
> > +	 * previous cxl_dpa_alloc() invocation is assumed to have accounted for
> > +	 * all previous partitions.
> > +	 */
> 
> 
> This is right, but the code below is not because ...
> 
> 
> > +	skip_start = CXL_RESOURCE_NONE;
> > +	for (int i = part; i; i--) {
> > +		prev = &cxlds->part[i - 1].res;
> > +		for (p = prev->child, last = NULL; p; p = p->sibling)
> > +			last = p;
> 
> 
> ... holes ...
> 
> 
> I think the problem here is we assumed ram and pmem being a child and 
> likely some free space, but a device with multiple HDM decoders implies 
> potentially several child.
> 
> The code supported the case of multiple child but I guess we still had 
> in mind the simple case. Otherwise I can not understand all this ...

Holes are not allowed. If you want to delete any decoder capacity you
need to tear down all higher DPA allocations. That is a constraint of
the hardware definition around how software can assume the value of
DPABase. Will add some comments to that effect.
diff mbox series

Patch

diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
index 7e1559b3ed88..4a2816102a1e 100644
--- a/drivers/cxl/core/hdm.c
+++ b/drivers/cxl/core/hdm.c
@@ -223,6 +223,30 @@  void cxl_dpa_debug(struct seq_file *file, struct cxl_dev_state *cxlds)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_dpa_debug, "CXL");
 
+static void release_skip(struct cxl_dev_state *cxlds,
+			 const resource_size_t skip_base,
+			 const resource_size_t skip_len)
+{
+	resource_size_t skip_start = skip_base, skip_rem = skip_len;
+
+	for (int i = 0; i < cxlds->nr_partitions; i++) {
+		const struct resource *part_res = &cxlds->part[i].res;
+		resource_size_t skip_end, skip_size;
+
+		if (skip_start < part_res->start || skip_start > part_res->end)
+			continue;
+
+		skip_end = min(part_res->end, skip_start + skip_rem - 1);
+		skip_size = skip_end - skip_start + 1;
+		__release_region(&cxlds->dpa_res, skip_start, skip_size);
+		skip_start += skip_size;
+		skip_rem -= skip_size;
+
+		if (!skip_rem)
+			break;
+	}
+}
+
 /*
  * Must be called in a context that synchronizes against this decoder's
  * port ->remove() callback (like an endpoint decoder sysfs attribute)
@@ -241,7 +265,7 @@  static void __cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
 	skip_start = res->start - cxled->skip;
 	__release_region(&cxlds->dpa_res, res->start, resource_size(res));
 	if (cxled->skip)
-		__release_region(&cxlds->dpa_res, skip_start, cxled->skip);
+		release_skip(cxlds, skip_start, cxled->skip);
 	cxled->skip = 0;
 	cxled->dpa_res = NULL;
 	put_device(&cxled->cxld.dev);
@@ -268,6 +292,47 @@  static void devm_cxl_dpa_release(struct cxl_endpoint_decoder *cxled)
 	__cxl_dpa_release(cxled);
 }
 
+static int request_skip(struct cxl_dev_state *cxlds,
+			struct cxl_endpoint_decoder *cxled,
+			const resource_size_t skip_base,
+			const resource_size_t skip_len)
+{
+	resource_size_t skip_start = skip_base, skip_rem = skip_len;
+
+	for (int i = 0; i < cxlds->nr_partitions; i++) {
+		const struct resource *part_res = &cxlds->part[i].res;
+		struct cxl_port *port = cxled_to_port(cxled);
+		resource_size_t skip_end, skip_size;
+		struct resource *res;
+
+		if (skip_start < part_res->start || skip_start > part_res->end)
+			continue;
+
+		skip_end = min(part_res->end, skip_start + skip_rem - 1);
+		skip_size = skip_end - skip_start + 1;
+
+		res = __request_region(&cxlds->dpa_res, skip_start, skip_size,
+				       dev_name(&cxled->cxld.dev), 0);
+		if (!res) {
+			dev_dbg(cxlds->dev,
+				"decoder%d.%d: failed to reserve skipped space\n",
+				port->id, cxled->cxld.id);
+			break;
+		}
+		skip_start += skip_size;
+		skip_rem -= skip_size;
+		if (!skip_rem)
+			break;
+	}
+
+	if (skip_rem == 0)
+		return 0;
+
+	release_skip(cxlds, skip_base, skip_len - skip_rem);
+
+	return -EBUSY;
+}
+
 static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
 			     resource_size_t base, resource_size_t len,
 			     resource_size_t skipped)
@@ -277,6 +342,7 @@  static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
 	struct cxl_dev_state *cxlds = cxlmd->cxlds;
 	struct device *dev = &port->dev;
 	struct resource *res;
+	int rc;
 
 	lockdep_assert_held_write(&cxl_dpa_rwsem);
 
@@ -305,14 +371,9 @@  static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
 	}
 
 	if (skipped) {
-		res = __request_region(&cxlds->dpa_res, base - skipped, skipped,
-				       dev_name(&cxled->cxld.dev), 0);
-		if (!res) {
-			dev_dbg(dev,
-				"decoder%d.%d: failed to reserve skipped space\n",
-				port->id, cxled->cxld.id);
-			return -EBUSY;
-		}
+		rc = request_skip(cxlds, cxled, base - skipped, skipped);
+		if (rc)
+			return rc;
 	}
 	res = __request_region(&cxlds->dpa_res, base, len,
 			       dev_name(&cxled->cxld.dev), 0);
@@ -320,16 +381,15 @@  static int __cxl_dpa_reserve(struct cxl_endpoint_decoder *cxled,
 		dev_dbg(dev, "decoder%d.%d: failed to reserve allocation\n",
 			port->id, cxled->cxld.id);
 		if (skipped)
-			__release_region(&cxlds->dpa_res, base - skipped,
-					 skipped);
+			release_skip(cxlds, base - skipped, skipped);
 		return -EBUSY;
 	}
 	cxled->dpa_res = res;
 	cxled->skip = skipped;
 
-	if (resource_contains(to_pmem_res(cxlds), res))
+	if (cxl_partition_contains(cxlds, CXL_PARTITION_PMEM, res))
 		cxled->mode = CXL_DECODER_PMEM;
-	else if (resource_contains(to_ram_res(cxlds), res))
+	else if (cxl_partition_contains(cxlds, CXL_PARTITION_RAM, res))
 		cxled->mode = CXL_DECODER_RAM;
 	else {
 		dev_warn(dev, "decoder%d.%d: %pr does not map any partition\n",
@@ -527,15 +587,13 @@  int cxl_dpa_set_mode(struct cxl_endpoint_decoder *cxled,
 int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
 {
 	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
-	resource_size_t free_ram_start, free_pmem_start;
 	struct cxl_port *port = cxled_to_port(cxled);
 	struct cxl_dev_state *cxlds = cxlmd->cxlds;
 	struct device *dev = &cxled->cxld.dev;
-	resource_size_t start, avail, skip;
+	struct resource *res, *prev = NULL;
+	resource_size_t start, avail, skip, skip_start;
 	struct resource *p, *last;
-	const struct resource *ram_res = to_ram_res(cxlds);
-	const struct resource *pmem_res = to_pmem_res(cxlds);
-	int rc;
+	int part, rc;
 
 	down_write(&cxl_dpa_rwsem);
 	if (cxled->cxld.region) {
@@ -551,47 +609,54 @@  int cxl_dpa_alloc(struct cxl_endpoint_decoder *cxled, unsigned long long size)
 		goto out;
 	}
 
-	for (p = ram_res->child, last = NULL; p; p = p->sibling)
-		last = p;
-	if (last)
-		free_ram_start = last->end + 1;
+	if (cxled->mode == CXL_DECODER_RAM)
+		part = CXL_PARTITION_RAM;
+	else if (cxled->mode == CXL_DECODER_PMEM)
+		part = CXL_PARTITION_PMEM;
 	else
-		free_ram_start = ram_res->start;
+		part = cxlds->nr_partitions;
+
+	if (part >= cxlds->nr_partitions) {
+		dev_dbg(dev, "partition %d not found\n", part);
+		rc = -EBUSY;
+		goto out;
+	}
+
+	res = &cxlds->part[part].res;
 
-	for (p = pmem_res->child, last = NULL; p; p = p->sibling)
+	for (p = res->child, last = NULL; p; p = p->sibling)
 		last = p;
 	if (last)
-		free_pmem_start = last->end + 1;
+		start = last->end + 1;
 	else
-		free_pmem_start = pmem_res->start;
+		start = res->start;
 
-	if (cxled->mode == CXL_DECODER_RAM) {
-		start = free_ram_start;
-		avail = ram_res->end - start + 1;
-		skip = 0;
-	} else if (cxled->mode == CXL_DECODER_PMEM) {
-		resource_size_t skip_start, skip_end;
-
-		start = free_pmem_start;
-		avail = pmem_res->end - start + 1;
-		skip_start = free_ram_start;
-
-		/*
-		 * If some pmem is already allocated, then that allocation
-		 * already handled the skip.
-		 */
-		if (pmem_res->child &&
-		    skip_start == pmem_res->child->start)
-			skip_end = skip_start - 1;
-		else
-			skip_end = start - 1;
-		skip = skip_end - skip_start + 1;
-	} else {
-		dev_dbg(dev, "mode not set\n");
-		rc = -EINVAL;
-		goto out;
+	/*
+	 * To allocate at partition N, a skip needs to be calculated for all
+	 * unallocated space at lower partition indices.
+	 *
+	 * If a partition has any allocations, the search can end because a
+	 * previous cxl_dpa_alloc() invocation is assumed to have accounted for
+	 * all previous partitions.
+	 */
+	skip_start = CXL_RESOURCE_NONE;
+	for (int i = part; i; i--) {
+		prev = &cxlds->part[i - 1].res;
+		for (p = prev->child, last = NULL; p; p = p->sibling)
+			last = p;
+		if (last) {
+			skip_start = last->end + 1;
+			break;
+		}
+		skip_start = prev->start;
 	}
 
+	avail = res->end - start + 1;
+	if (skip_start == CXL_RESOURCE_NONE)
+		skip = 0;
+	else
+		skip = res->start - skip_start;
+
 	if (size > avail) {
 		dev_dbg(dev, "%pa exceeds available %s capacity: %pa\n", &size,
 			cxl_decoder_mode_name(cxled->mode), &avail);
diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 2e728d4b7327..43acd48b300f 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -515,6 +515,15 @@  static inline resource_size_t cxl_pmem_size(struct cxl_dev_state *cxlds)
 	return resource_size(res);
 }
 
+static inline bool cxl_partition_contains(struct cxl_dev_state *cxlds,
+					  unsigned int part,
+					  struct resource *res)
+{
+	if (part >= cxlds->nr_partitions)
+		return false;
+	return resource_contains(&cxlds->part[part].res, res);
+}
+
 static inline struct cxl_dev_state *mbox_to_cxlds(struct cxl_mailbox *cxl_mbox)
 {
 	return dev_get_drvdata(cxl_mbox->host);