diff mbox series

[RFC,v2,14/18] dax/region: Support DAX device creation on dynamic DAX regions

Message ID 20230604-dcd-type2-upstream-v2-14-f740c47e7916@intel.com
State New, archived
Headers show
Series DCD: Add support for Dynamic Capacity Devices (DCD) | expand

Commit Message

Ira Weiny Aug. 29, 2023, 5:21 a.m. UTC
Dynamic Capacity (DC) DAX regions have a list of extents which define
the memory of the region which is available.

Now that DAX region extents are fully realized, support DAX device
creation on dynamic regions by adjusting the allocation algorithms
to account for the extents.  Remember also references must be held on
the extents until the DAX devices are done with the memory.

Redefine the region available size to include only extent space.  Reuse
the size allocation algorithm by defining sub-resources for each extent
and limiting range allocation to those extents which have space.  Do not
support direct mapping of DAX devices on dynamic devices.

Enhance DAX device range objects to hold references on the extents until
the DAX device is destroyed.

NOTE: At this time all extents within a region are created equally.
However, labels are associated with extents which can be used with
future DAX device labels to group which extents are used.

Signed-off-by: Ira Weiny <ira.weiny@intel.com>
---
 drivers/dax/bus.c         | 157 +++++++++++++++++++++++++++++++++++++++-------
 drivers/dax/cxl.c         |  44 +++++++++++++
 drivers/dax/dax-private.h |   5 ++
 3 files changed, 182 insertions(+), 24 deletions(-)

Comments

Jonathan Cameron Aug. 30, 2023, 11:50 a.m. UTC | #1
On Mon, 28 Aug 2023 22:21:05 -0700
Ira Weiny <ira.weiny@intel.com> wrote:

> Dynamic Capacity (DC) DAX regions have a list of extents which define
> the memory of the region which is available.
> 
> Now that DAX region extents are fully realized support DAX device
> creation on dynamic regions by adjusting the allocation algorithms
> to account for the extents.  Remember also references must be held on
> the extents until the DAX devices are done with the memory.
> 
> Redefine the region available size to include only extent space.  Reuse
> the size allocation algorithm by defining sub-resources for each extent
> and limiting range allocation to those extents which have space.  Do not
> support direct mapping of DAX devices on dynamic devices.
> 
> Enhance DAX device range objects to hold references on the extents until
> the DAX device is destroyed.
> 
> NOTE: At this time all extents within a region are created equally.
> However, labels are associated with extents which can be used with
> future DAX device labels to group which extents are used.

This sounds like a bad place to start to me as we are enabling something
that is probably 'wrong' in the long term as opposed to just not enabling it
until we have appropriate support.
I'd argue better to just reject any extents with different labels for now.

As this is an RFC meh ;)

Whilst this looks fine to me, I'm rather out of my depth wrt the DAX
side of things so take that with a pinch of salt.

Jonathan


> 
> Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> ---
>  drivers/dax/bus.c         | 157 +++++++++++++++++++++++++++++++++++++++-------
>  drivers/dax/cxl.c         |  44 +++++++++++++
>  drivers/dax/dax-private.h |   5 ++
>  3 files changed, 182 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
> index ea7ae82b4687..a9ea6a706702 100644
> --- a/drivers/dax/bus.c
> +++ b/drivers/dax/bus.c

...


> @@ -1183,7 +1290,7 @@ static ssize_t mapping_store(struct device *dev, struct device_attribute *attr,
>  	to_alloc = range_len(&r);
>  	if (alloc_is_aligned(dev_dax, to_alloc))
>  		rc = alloc_dev_dax_range(&dax_region->res, dev_dax, r.start,
> -					 to_alloc);
> +					 to_alloc, NULL);
>  	device_unlock(dev);
>  	device_unlock(dax_region->dev);
>  
> @@ -1400,8 +1507,10 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data)
>  	device_initialize(dev);
>  	dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id);
>  
> +	dev_WARN_ONCE(parent, is_dynamic(dax_region) && data->size,
> +		      "Dynamic DAX devices are created initially with 0 size");

dev_info() may be more appropriate?   Is this common enough that we need the
_ONCE?


>  	rc = alloc_dev_dax_range(&dax_region->res, dev_dax, dax_region->res.start,
> -				 data->size);
> +				 data->size, NULL);
>  	if (rc)
>  		goto err_range;
>  
> diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> index 44cbd28668f1..6394a3531e25 100644
> --- a/drivers/dax/cxl.c
> +++ b/drivers/dax/cxl.c
...


>  static int cxl_dax_region_create_extent(struct dax_region *dax_region,
>  					struct cxl_dr_extent *cxl_dr_ext)
>  {
> @@ -45,11 +80,20 @@ static int cxl_dax_region_create_extent(struct dax_region *dax_region,
>  	/* device manages the dr_extent on success */
>  	kref_init(&dr_extent->ref);
>  
> +	rc = dax_region_add_resource(dax_region, dr_extent,
> +				     cxl_dr_ext->hpa_offset,
> +				     cxl_dr_ext->hpa_length);
> +	if (rc) {
> +		kfree(dr_extent);

goto for these and single unwinding block?

> +		return rc;
> +	}
> +
>  	rc = dax_region_ext_create_dev(dax_region, dr_extent,
>  				       cxl_dr_ext->hpa_offset,
>  				       cxl_dr_ext->hpa_length,
>  				       cxl_dr_ext->label);
>  	if (rc) {
> +		dax_region_rm_resource(dr_extent);
>  		kfree(dr_extent);
as above.

>  		return rc;
>  	}
> diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
> index 250babd6e470..ad73b53aa802 100644
> --- a/drivers/dax/dax-private.h
> +++ b/drivers/dax/dax-private.h
> @@ -44,12 +44,16 @@ struct dax_region {
>  /*
>   * struct dax_region_extent - extent data defined by the low level region
>   * driver.
> + * @region: cache of dax_region
> + * @res: cache of resource tree for this extent
>   * @private_data: lower level region driver data

Not sure 'lower level' is well defined here. Is "region driver data"
not enough?

>   * @ref: track number of dax devices which are using this extent
>   * @get: get reference to low level data
>   * @put: put reference to low level data
>   */
>  struct dax_region_extent {
> +	struct dax_region *region;
> +	struct resource *res;
>  	void *private_data;
>  	struct kref ref;
>  	void (*get)(struct dax_region_extent *dr_extent);
> @@ -131,6 +135,7 @@ struct dev_dax {
>  		unsigned long pgoff;
>  		struct range range;
>  		struct dax_mapping *mapping;
> +		struct dax_region_extent *dr_extent;

Huh. Seems that 'ranges' is in the kernel-doc but the fields
that make it up are not.  Maybe good to add the docs for them
whilst here?

>  	} *ranges;
>  };
>  
>
Ira Weiny Sept. 6, 2023, 4:35 a.m. UTC | #2
Jonathan Cameron wrote:
> On Mon, 28 Aug 2023 22:21:05 -0700
> Ira Weiny <ira.weiny@intel.com> wrote:
> 
> > Dynamic Capacity (DC) DAX regions have a list of extents which define
> > the memory of the region which is available.
> > 
> > Now that DAX region extents are fully realized support DAX device
> > creation on dynamic regions by adjusting the allocation algorithms
> > to account for the extents.  Remember also references must be held on
> > the extents until the DAX devices are done with the memory.
> > 
> > Redefine the region available size to include only extent space.  Reuse
> > the size allocation algorithm by defining sub-resources for each extent
> > and limiting range allocation to those extents which have space.  Do not
> > support direct mapping of DAX devices on dynamic devices.
> > 
> > Enhance DAX device range objects to hold references on the extents until
> > the DAX device is destroyed.
> > 
> > NOTE: At this time all extents within a region are created equally.
> > However, labels are associated with extents which can be used with
> > future DAX device labels to group which extents are used.
> 
> This sound like a bad place to start to me as we are enabling something
> that is probably 'wrong' in the long term as opposed to just not enabling it
> until we have appropriate support.

I disagree.  I don't think the kernel should be trying to process tags at
the lower level.

> I'd argue better to just reject any extents with different labels for now.

Again I disagree.  This is less restrictive.  The idea is that labels can
be changed such that user space can ultimately decide which extents
should be used for which devices.  I have some work on that already.
(Basically it becomes quite easy to assign a label to a dax device and
have the extent search use only dax extents which match that label.)

> 
> As this is an RFC meh ;)

Sure!  ;-)

> 
> Whilst this looks fine to me, I'm rather out of my depth wrt to the DAX
> side of things so take that with a pinch of salt.

NP

> 
> Jonathan
> 
> 
> > 
> > Signed-off-by: Ira Weiny <ira.weiny@intel.com>
> > ---
> >  drivers/dax/bus.c         | 157 +++++++++++++++++++++++++++++++++++++++-------
> >  drivers/dax/cxl.c         |  44 +++++++++++++
> >  drivers/dax/dax-private.h |   5 ++
> >  3 files changed, 182 insertions(+), 24 deletions(-)
> > 
> > diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
> > index ea7ae82b4687..a9ea6a706702 100644
> > --- a/drivers/dax/bus.c
> > +++ b/drivers/dax/bus.c
> 
> ...
> 
> 
> > @@ -1183,7 +1290,7 @@ static ssize_t mapping_store(struct device *dev, struct device_attribute *attr,
> >  	to_alloc = range_len(&r);
> >  	if (alloc_is_aligned(dev_dax, to_alloc))
> >  		rc = alloc_dev_dax_range(&dax_region->res, dev_dax, r.start,
> > -					 to_alloc);
> > +					 to_alloc, NULL);
> >  	device_unlock(dev);
> >  	device_unlock(dax_region->dev);
> >  
> > @@ -1400,8 +1507,10 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data)
> >  	device_initialize(dev);
> >  	dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id);
> >  
> > +	dev_WARN_ONCE(parent, is_dynamic(dax_region) && data->size,
> > +		      "Dynamic DAX devices are created initially with 0 size");
> 
> dev_info() maybe more appropriate?

Unless I'm mistaken this can happen from userspace but only if something
in the code changes later.  Because the dax layer is trying to support
non-dynamic regions (for which 'dynamic' may be a bad name), I was worried that
the creation with a size might slip through...

> Is this common enough that we need the
> _ONCE?

once is because it could end up spamming a log later if something got
coded up wrong.

> 
> 
> >  	rc = alloc_dev_dax_range(&dax_region->res, dev_dax, dax_region->res.start,
> > -				 data->size);
> > +				 data->size, NULL);
> >  	if (rc)
> >  		goto err_range;
> >  
> > diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
> > index 44cbd28668f1..6394a3531e25 100644
> > --- a/drivers/dax/cxl.c
> > +++ b/drivers/dax/cxl.c
> ...
> 
> 
> >  static int cxl_dax_region_create_extent(struct dax_region *dax_region,
> >  					struct cxl_dr_extent *cxl_dr_ext)
> >  {
> > @@ -45,11 +80,20 @@ static int cxl_dax_region_create_extent(struct dax_region *dax_region,
> >  	/* device manages the dr_extent on success */
> >  	kref_init(&dr_extent->ref);
> >  
> > +	rc = dax_region_add_resource(dax_region, dr_extent,
> > +				     cxl_dr_ext->hpa_offset,
> > +				     cxl_dr_ext->hpa_length);
> > +	if (rc) {
> > +		kfree(dr_extent);
> 
> goto for these and single unwinding block?

Yea.  Done.

> 
> > +		return rc;
> > +	}
> > +
> >  	rc = dax_region_ext_create_dev(dax_region, dr_extent,
> >  				       cxl_dr_ext->hpa_offset,
> >  				       cxl_dr_ext->hpa_length,
> >  				       cxl_dr_ext->label);
> >  	if (rc) {
> > +		dax_region_rm_resource(dr_extent);
> >  		kfree(dr_extent);
> as above.

Done.

> 
> >  		return rc;
> >  	}
> > diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
> > index 250babd6e470..ad73b53aa802 100644
> > --- a/drivers/dax/dax-private.h
> > +++ b/drivers/dax/dax-private.h
> > @@ -44,12 +44,16 @@ struct dax_region {
> >  /*
> >   * struct dax_region_extent - extent data defined by the low level region
> >   * driver.
> > + * @region: cache of dax_region
> > + * @res: cache of resource tree for this extent
> >   * @private_data: lower level region driver data
> 
> Not sure 'lower level' is well defined here. Is "region driver data"
> not enough?

For me it was not.  I'll have to sleep on it.  Technically there is no
dax_region 'driver' but only a dax_region device.

> 
> >   * @ref: track number of dax devices which are using this extent
> >   * @get: get reference to low level data
> >   * @put: put reference to low level data
> >   */
> >  struct dax_region_extent {
> > +	struct dax_region *region;
> > +	struct resource *res;
> >  	void *private_data;
> >  	struct kref ref;
> >  	void (*get)(struct dax_region_extent *dr_extent);
> > @@ -131,6 +135,7 @@ struct dev_dax {
> >  		unsigned long pgoff;
> >  		struct range range;
> >  		struct dax_mapping *mapping;
> > +		struct dax_region_extent *dr_extent;
> 
> Huh. Seems that ranges is in the kernel doc but not the
> bits that make that up.  Maybe good to add the docs
> whilst here?

oh.  sure.  took me a couple of reads of this sentence.

I'm going to think on this too.

Ira

> 
> >  	} *ranges;
> >  };
> >  
> > 
>
Jonathan Cameron Sept. 12, 2023, 4:49 p.m. UTC | #3
On Tue, 5 Sep 2023 21:35:03 -0700
Ira Weiny <ira.weiny@intel.com> wrote:

> Jonathan Cameron wrote:
> > On Mon, 28 Aug 2023 22:21:05 -0700
> > Ira Weiny <ira.weiny@intel.com> wrote:
> >   
> > > Dynamic Capacity (DC) DAX regions have a list of extents which define
> > > the memory of the region which is available.
> > > 
> > > Now that DAX region extents are fully realized support DAX device
> > > creation on dynamic regions by adjusting the allocation algorithms
> > > to account for the extents.  Remember also references must be held on
> > > the extents until the DAX devices are done with the memory.
> > > 
> > > Redefine the region available size to include only extent space.  Reuse
> > > the size allocation algorithm by defining sub-resources for each extent
> > > and limiting range allocation to those extents which have space.  Do not
> > > support direct mapping of DAX devices on dynamic devices.
> > > 
> > > Enhance DAX device range objects to hold references on the extents until
> > > the DAX device is destroyed.
> > > 
> > > NOTE: At this time all extents within a region are created equally.
> > > However, labels are associated with extents which can be used with
> > > future DAX device labels to group which extents are used.  
> > 
> > This sound like a bad place to start to me as we are enabling something
> > that is probably 'wrong' in the long term as opposed to just not enabling it
> > until we have appropriate support.  
> 
> I disagree.  I don't think the kernel should be trying to process tags at
> the lower level.
> 
> > I'd argue better to just reject any extents with different labels for now.  
> 
> Again I disagree.  This is less restrictive.  The idea is that labels can
> be changed such that user space can ultimately decided which extents
> should be used for which devices.  I have some work on that already.
> (Basically it becomes quite easy to assign a label to a dax device and
> have the extent search use only dax extents which match that label.)

That sounds good - but if someone expects that and uses it with an old
kernel I'm not sure if it is better to say 'we don't support it yet' or
do something different from a newer kernel.


> > > @@ -1400,8 +1507,10 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data)
> > >  	device_initialize(dev);
> > >  	dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id);
> > >  
> > > +	dev_WARN_ONCE(parent, is_dynamic(dax_region) && data->size,
> > > +		      "Dynamic DAX devices are created initially with 0 size");  
> > 
> > dev_info() maybe more appropriate?  
> 
> Unless I'm mistaken this can happen from userspace but only if something
> in the code changes later.  Because the dax layer is trying to support
> non-dynamic regions (which dynamic may be a bad name), I was worried that
> the creation with a size might slip through...

Fair enough - if there is a strong chance userspace will control it at some point then
ONCE seems fine.

> 
> > Is this common enough that we need the
> > _ONCE?  
> 
> once is because it could end up spamming a log later if something got
> coded up wrong.

I'm not sure I care about bugs spamming the log.   Only things that
are userspace controlled or likely hardware failures etc.
Ira Weiny Sept. 12, 2023, 10:08 p.m. UTC | #4
Jonathan Cameron wrote:
> On Tue, 5 Sep 2023 21:35:03 -0700
> Ira Weiny <ira.weiny@intel.com> wrote:
> 
> > Jonathan Cameron wrote:
> > > On Mon, 28 Aug 2023 22:21:05 -0700
> > > Ira Weiny <ira.weiny@intel.com> wrote:
> > >   
> > > > Dynamic Capacity (DC) DAX regions have a list of extents which define
> > > > the memory of the region which is available.
> > > > 
> > > > Now that DAX region extents are fully realized support DAX device
> > > > creation on dynamic regions by adjusting the allocation algorithms
> > > > to account for the extents.  Remember also references must be held on
> > > > the extents until the DAX devices are done with the memory.
> > > > 
> > > > Redefine the region available size to include only extent space.  Reuse
> > > > the size allocation algorithm by defining sub-resources for each extent
> > > > and limiting range allocation to those extents which have space.  Do not
> > > > support direct mapping of DAX devices on dynamic devices.
> > > > 
> > > > Enhance DAX device range objects to hold references on the extents until
> > > > the DAX device is destroyed.
> > > > 
> > > > NOTE: At this time all extents within a region are created equally.
> > > > However, labels are associated with extents which can be used with
> > > > future DAX device labels to group which extents are used.  
> > > 
> > > This sound like a bad place to start to me as we are enabling something
> > > that is probably 'wrong' in the long term as opposed to just not enabling it
> > > until we have appropriate support.  
> > 
> > I disagree.  I don't think the kernel should be trying to process tags at
> > the lower level.
> > 
> > > I'd argue better to just reject any extents with different labels for now.  
> > 
> > Again I disagree.  This is less restrictive.  The idea is that labels can
> > be changed such that user space can ultimately decided which extents
> > should be used for which devices.  I have some work on that already.
> > (Basically it becomes quite easy to assign a label to a dax device and
> > have the extent search use only dax extents which match that label.)
> 
> That sounds good - but if someone expects that and uses it with an old
> kernel I'm not sure if it is better to say 'we don't support it yet' or
> do something different from a newer kernel.

This does provide the 'we don't support that yet' in that dax device
creation can't be associated with a label yet.  So surfacing the extents
with the tag as a default label and letting those labels change is more
informational at this point and not functional.  Simple use cases can use
the label (from the tag) to detect that some extent with the wrong tag got
in the region but can't correct it without going through the FM.

It is easy enough to remove the label sysfs and defer that until the dax
device has a label and this support though.

> 
> 
> > > > @@ -1400,8 +1507,10 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data)
> > > >  	device_initialize(dev);
> > > >  	dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id);
> > > >  
> > > > +	dev_WARN_ONCE(parent, is_dynamic(dax_region) && data->size,
> > > > +		      "Dynamic DAX devices are created initially with 0 size");  
> > > 
> > > dev_info() maybe more appropriate?  
> > 
> > Unless I'm mistaken this can happen from userspace but only if something
> > in the code changes later.  Because the dax layer is trying to support
> > non-dynamic regions (which dynamic may be a bad name), I was worried that
> > the creation with a size might slip through...
> 
> Fair enough - if strong chance userspace will control it at somepoitn then
> ONCE seems fine.
> 
> > 
> > > Is this common enough that we need the
> > > _ONCE?  
> > 
> > once is because it could end up spamming a log later if something got
> > coded up wrong.
> 
> I'm not sure I care about bugs spamming the log.   Only things that
> are userspace controlled or likely hardware failures etc.
> 

Understood.  Let me trace them again but I think these can be triggered by
user space.  If not I'll remove the ONCE.

Thanks again,
Ira
Dan Williams Sept. 12, 2023, 10:35 p.m. UTC | #5
Ira Weiny wrote:
> Jonathan Cameron wrote:
> > On Tue, 5 Sep 2023 21:35:03 -0700
> > Ira Weiny <ira.weiny@intel.com> wrote:
> > 
> > > Jonathan Cameron wrote:
> > > > On Mon, 28 Aug 2023 22:21:05 -0700
> > > > Ira Weiny <ira.weiny@intel.com> wrote:
> > > >   
> > > > > Dynamic Capacity (DC) DAX regions have a list of extents which define
> > > > > the memory of the region which is available.
> > > > > 
> > > > > Now that DAX region extents are fully realized support DAX device
> > > > > creation on dynamic regions by adjusting the allocation algorithms
> > > > > to account for the extents.  Remember also references must be held on
> > > > > the extents until the DAX devices are done with the memory.
> > > > > 
> > > > > Redefine the region available size to include only extent space.  Reuse
> > > > > the size allocation algorithm by defining sub-resources for each extent
> > > > > and limiting range allocation to those extents which have space.  Do not
> > > > > support direct mapping of DAX devices on dynamic devices.
> > > > > 
> > > > > Enhance DAX device range objects to hold references on the extents until
> > > > > the DAX device is destroyed.
> > > > > 
> > > > > NOTE: At this time all extents within a region are created equally.
> > > > > However, labels are associated with extents which can be used with
> > > > > future DAX device labels to group which extents are used.  
> > > > 
> > > > This sound like a bad place to start to me as we are enabling something
> > > > that is probably 'wrong' in the long term as opposed to just not enabling it
> > > > until we have appropriate support.  
> > > 
> > > I disagree.  I don't think the kernel should be trying to process tags at
> > > the lower level.
> > > 
> > > > I'd argue better to just reject any extents with different labels for now.  
> > > 
> > > Again I disagree.  This is less restrictive.  The idea is that labels can
> > > be changed such that user space can ultimately decided which extents
> > > should be used for which devices.  I have some work on that already.
> > > (Basically it becomes quite easy to assign a label to a dax device and
> > > have the extent search use only dax extents which match that label.)
> > 
> > That sounds good - but if someone expects that and uses it with an old
> > kernel I'm not sure if it is better to say 'we don't support it yet' or
> > do something different from a newer kernel.
> 
> This does provide the 'we don't support that yet' in that dax device
> creation can't be associated with a label yet.  So surfacing the extents
> with the tag as a default label and letting those labels change is more
> informational at this point and not functional.  Simple use cases can use
> the label (from the tag) to detect that some extent with the wrong tag got
> in the region but can't correct it without going through the FM.
> 
> It is easy enough to remove the label sysfs and defer that until the dax
> device has a label and this support though.

Catching up on just this point (still need to go through the whole
thing).  A Sparse DAX region is one where the extents need not be
present at DAX region instantiation and may be added/removed later. The
device-dax allocation scheme just takes a size to do a "first-available"
search for free capacity in the region.

Given that one of the expected DCD use cases is to provide just in time
memory for specific jobs the "first-available" search for free capacity
in a Sparse DAX Region collides with the need to keep allocations
bounded by tag.

I agree with Jonathan that unless and until the allocation scheme is
updated to be tag aware then there is no reason for allocate by tag to
exist in the interface.

That said, the next question, "is DCD enabling considered a toy until
the ability to allocate by tag is present?" I think yes, to the point
where old daxctl binaries should be made to fail to create device instances
by forcing a tag to be selected at allocation time for Sparse DAX
Regions.

The last question is whether *writable* tags are needed to allow for
repurposing memory allocated to a host without needing to round trip it
through the FM to get it re-tagged. While that is something the host and
orchestrator can figure out on their own, it looks like a nice to have
until the above questions are answered.

> > > > > @@ -1400,8 +1507,10 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data)
> > > > >  	device_initialize(dev);
> > > > >  	dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id);
> > > > >  
> > > > > +	dev_WARN_ONCE(parent, is_dynamic(dax_region) && data->size,
> > > > > +		      "Dynamic DAX devices are created initially with 0 size");  
> > > > 
> > > > dev_info() maybe more appropriate?  
> > > 
> > > Unless I'm mistaken this can happen from userspace but only if something
> > > in the code changes later.  Because the dax layer is trying to support
> > > non-dynamic regions (which dynamic may be a bad name), I was worried that
> > > the creation with a size might slip through...
> > 
> > Fair enough - if strong chance userspace will control it at somepoitn then
> > ONCE seems fine.
> > 
> > > 
> > > > Is this common enough that we need the
> > > > _ONCE?  
> > > 
> > > once is because it could end up spamming a log later if something got
> > > coded up wrong.
> > 
> > I'm not sure I care about bugs spamming the log.   Only things that
> > are userspace controlled or likely hardware failures etc.
> > 
> 
> Understood.  Let me trace them again but I think these can be triggered by
> user space.  If not I'll remove the ONCE.

Unless this is an unequivocal kernel bug if it fires, and there is a
significant potential for active development to do the wrong thing,
don't leave a panic_on_warn land mine.
Ira Weiny Sept. 13, 2023, 5:30 p.m. UTC | #6
Dan Williams wrote:
> Ira Weiny wrote:
> > Jonathan Cameron wrote:
> > > On Tue, 5 Sep 2023 21:35:03 -0700
> > > Ira Weiny <ira.weiny@intel.com> wrote:
> > > 
> > > > Jonathan Cameron wrote:
> > > > > On Mon, 28 Aug 2023 22:21:05 -0700
> > > > > Ira Weiny <ira.weiny@intel.com> wrote:
> > > > >   
> > > > > > Dynamic Capacity (DC) DAX regions have a list of extents which define
> > > > > > the memory of the region which is available.
> > > > > > 
> > > > > > Now that DAX region extents are fully realized support DAX device
> > > > > > creation on dynamic regions by adjusting the allocation algorithms
> > > > > > to account for the extents.  Remember also references must be held on
> > > > > > the extents until the DAX devices are done with the memory.
> > > > > > 
> > > > > > Redefine the region available size to include only extent space.  Reuse
> > > > > > the size allocation algorithm by defining sub-resources for each extent
> > > > > > and limiting range allocation to those extents which have space.  Do not
> > > > > > support direct mapping of DAX devices on dynamic devices.
> > > > > > 
> > > > > > Enhance DAX device range objects to hold references on the extents until
> > > > > > the DAX device is destroyed.
> > > > > > 
> > > > > > NOTE: At this time all extents within a region are created equally.
> > > > > > However, labels are associated with extents which can be used with
> > > > > > future DAX device labels to group which extents are used.  
> > > > > 
> > > > > This sound like a bad place to start to me as we are enabling something
> > > > > that is probably 'wrong' in the long term as opposed to just not enabling it
> > > > > until we have appropriate support.  
> > > > 
> > > > I disagree.  I don't think the kernel should be trying to process tags at
> > > > the lower level.
> > > > 
> > > > > I'd argue better to just reject any extents with different labels for now.  
> > > > 
> > > > Again I disagree.  This is less restrictive.  The idea is that labels can
> > > > be changed such that user space can ultimately decided which extents
> > > > should be used for which devices.  I have some work on that already.
> > > > (Basically it becomes quite easy to assign a label to a dax device and
> > > > have the extent search use only dax extents which match that label.)
> > > 
> > > That sounds good - but if someone expects that and uses it with an old
> > > kernel I'm not sure if it is better to say 'we don't support it yet' or
> > > do something different from a newer kernel.
> > 
> > This does provide the 'we don't support that yet' in that dax device
> > creation can't be associated with a label yet.  So surfacing the extents
> > with the tag as a default label and letting those labels change is more
> > informational at this point and not functional.  Simple use cases can use
> > the label (from the tag) to detect that some extent with the wrong tag got
> > in the region but can't correct it without going through the FM.
> > 
> > It is easy enough to remove the label sysfs and defer that until the dax
> > device has a label and this support though.
> 
> Catching up on just this point (still need to go through the whole
> thing).  A Sparse DAX region is one where the extents need not be
> present at DAX region instantiation and may be added/removed later. The
> device-dax allocation scheme just takes a size to do a "first-available"
> search for free capacity in the region.

Agreed.  And this is the way things work now.

Also your use of 'Sparse DAX region' seems better than the word 'dynamic'
I have used now.  I know that static regions mean something else but I
could not think of a better word.  I'll make adjustments to the
code/commit messages.

> 
> Given that one of the expected DCD use cases is to provide just in time
> memory for specific jobs the "first-available" search for free capacity
> in a Sparse DAX Region collides with the need to keep allocations
> bounded by tag.

How does it collide?

My attempt here is to leave dax devices 'unlabeled'.  As such they will use
space on a 'first-available' search regardless of extent labels.

Effectively I have defined 'no label' as being 'any label'.  I apologize
for this detail being implicit and not explicit.

My envisioned path would be that older daxctl would continue to work like
this because the kernel would not restrict unlabeled dax device creation.

Newer daxctl could use dax device labels to control the extents used.  But
only when dax device labeling is introduced in a future kernel.  Use of a
newer daxctl on an older DCD kernel could continue to work sans label.

In this way I envisioned a path where the policy is completely dictated by
user space restricted only by the software available.

> 
> I agree with Jonathan that unless and until the allocation scheme is
> updated to be tag aware then there is no reason for allocate by tag to
> exist in the interface.

I will agree that it was perhaps premature to introduce labels on the
extents.  However, I did so to give tags a space to be informationally
surfaced.

IMO we must have a plan forward or wait until that plan is fully formed
and implemented.  The size of this set is rather large.  Therefore, I was
hoping that a plan would be enough to move forward.

> 
> That said, the next question, "is DCD enabling considered a toy until
> the ability to allocate by tag is present?" I think yes, to the point
> where old daxctl binaries should be made fail to create device instances
> by forcing a tag to be selected at allocation time for Sparse DAX
> Regions.

Interesting.  I was not considering allocate by label to be a requirement
but rather an enhancement.  Labels IMO are a further refinement of the
memory space allocation.  I can see a very valid use case (not toy use
case) where all the DCD memory allocated to a node is dedicated to a
singular job and is done without tags or even ignoring tags.  Many HPC
sites run with singular jobs per host.

> 
> The last question is whether *writable* tags are needed to allow for
> repurposing memory allocated to a host without needing to round trip it
> through the FM to get it re-tagged. While that is something the host and
> orchestrator can figure out on their own, it looks like a nice to have
> until the above questions are answered.

Needed?  No.  Of course not.  As you said the orchestrator software can
keep iterating with the FM until it gets what it wants.  It was you who
had the idea of writable labels and I agreed.

"Seemed like a good idea at the time..."  ;-)

As I have reviewed and rewritten this message I worry that writable labels
are a bad idea.  Interleaving will most likely depend on grouping extent
tags into the CXL/DAX extent.  With this in mind adjusting extents is
potentially going to require an FM interaction to get things set up
anyway.

	[Again re-reading my message I thought of another issue.  What
	happens if the user decides to change the label on an extent after
	some dax device has been created with the old label?  That seems like an additional
	complication which is best left out by not allowing extent labels
	to be writable.]

I think writable labels are orthogonal to the kernel behavior though.
Allowing labels to change after the fact is a policy matter which is not
something the kernel needs to manage.

The kernel does need to manage how it allocates a dax device across the
extents available.  Assigning a dax label and allocating to the extents
matching that label is very straight forward.  The real issue is how to
deal with the 'no label' case.

As a path forward, I made a couple of assumptions.  First was the idea of
'no dax device label' == 'any extent label'.  Second, was that current dax
device creation was done as 'no dax device label'.

In this way I did not see a requirement to fully implement label
restriction on dax devices.  Labels are simply a nice to have thing to
group extents later.  Also, if you want dax devices created with specific
extents you have to assign them a label.  Otherwise they are allocated
'first-available' like they have been in the past.

I see a few ways forward.

One is to define 'no dax device label' as 'any extent label' as I have it
now.  IMO this provides the most backwards compatible dax device creation.
The ndctl region code additions are minimal and there are no daxctl
modifications required at all.

A second is to define 'no dax device label' as 'no extent label' and go
forward with this series but add a restriction on dax device creation to
only extents without a label.  This is still pretty compatible but if tags
are used then some extents would not be available without additional
daxctl modifications.

A third way forward is to fully implement label enabled dax device
creation.  In this case I feel like the direction is to make 'no label' ==
'no label'.  This is not hard but will take a couple more weeks to get the
daxctl code and all the testing.

It warrants mentioning that tags are an optional feature.  I feel like
there is momentum in the community to not use tags initially.  And so I
was targeting an initial implementation which really did not need tags at
all.  Perhaps I am wrong in that assumption?  Or perhaps I was short
sighted (possibly because interleaving becomes more straight forward)?

To summarize I see the following fundamental questions.

	1) Do we require DCD support to require dax device label
	   management?
	2) What does 'no dax device label' mean?
		a) any extent label
		b) no extent label
	3) Should writable labels be allowed on extents?
		a) this is more flexible
		b) security issues?
		c) does it just confuse things with interleaving?
		d) nice to change the tag name to something easy to read?
		e) other issues?
	4) How should the available size for labels be communicated to the
	   user?
	   	a) currently available size reflects an 'any extent label'
		   behavior when there is no label on the dax device.
		b) this becomes an issue if labelless dax devices are
		   restricted to labelless extents.

My current view is:
	1) No.  Current dax devices can be defined as 'no label'
	2) I'm not sure.  I can see both ways having benefits.
	3) No I think the ROI is not worth it.
	4) The use of 'any extent label' in #2 means that available size
	   retains its meaning for no label dax devices.  Labeled dax
	   devices would require a future enhancement to size information.

> 
> > > > > > @@ -1400,8 +1507,10 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data)
> > > > > >  	device_initialize(dev);
> > > > > >  	dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id);
> > > > > >  
> > > > > > +	dev_WARN_ONCE(parent, is_dynamic(dax_region) && data->size,
> > > > > > +		      "Dynamic DAX devices are created initially with 0 size");  
> > > > > 
> > > > > dev_info() maybe more appropriate?  
> > > > 
> > > > Unless I'm mistaken this can happen from userspace but only if something
> > > > in the code changes later.  Because the dax layer is trying to support
> > > > non-dynamic regions (which dynamic may be a bad name), I was worried that
> > > > the creation with a size might slip through...
> > > 
> > > Fair enough - if strong chance userspace will control it at some point then
> > > ONCE seems fine.
> > > 
> > > > 
> > > > > Is this common enough that we need the
> > > > > _ONCE?  
> > > > 
> > > > once is because it could end up spamming a log later if something got
> > > > coded up wrong.
> > > 
> > > I'm not sure I care about bugs spamming the log.   Only things that
> > > are userspace controlled or likely hardware failures etc.
> > > 
> > 
> > Understood.  Let me trace them again but I think these can be triggered by
> > user space.  If not I'll remove the ONCE.
> 
> Unless this is an unequivocal kernel bug if it fires, and there is a
> significant potential for active development to do the wrong thing,
> don't leave a panic_on_warn land mine.

Indeed.  I forgot about those panic on warn users.  I'll remove the warn
altogether.

Thanks,
Ira
Dan Williams Sept. 13, 2023, 5:59 p.m. UTC | #7
Ira Weiny wrote:
[..]
> > 
> > Given that one of the expected DCD use cases is to provide just in time
> > memory for specific jobs the "first-available" search for free capacity
> > in a Sparse DAX Region collides with the need to keep allocations
> > bounded by tag.
> 
> How does it collide?
> 
> My attempt here is to leave dax devices 'unlabeled'.  As such they will use
> space on a 'first-available' search regardless of extent labels.
> 
> Effectively I have defined 'no label' as being 'any label'.  I apologize
> for this detail being implicit and not explicit.
> 
> My envisioned path would be that older daxctl would continue to work like
> this because the kernel would not restrict unlabeled dax device creation.
> 
> Newer daxctl could use dax device labels to control the extents used.  But
> only when dax device labeling is introduced in a future kernel.  Use of a
> newer daxctl on an older DCD kernel could continue to work sans label.
> 
> In this way I envisioned a path where the policy is completely dictated by
> user space restricted only by the software available.

Tags are a core concept in DCD. "Allocate by tag" does not feel like
something that can come later at least in terms of when the DCD ABI is
ready for upstream. So, yes, it can remain out of this patchset, but the
upstream merge of all of DCD would be gated on that facility arriving.

> > I agree with Jonathan that unless and until the allocation scheme is
> > updated to be tag aware then there is no reason for allocate by tag to
> > exist in the interface.
> 
> I will agree that it was perhaps premature to introduce labels on the
> extents.  However, I did so to give tags a space to be informationally
> surfaced.
> 
> IMO we must have a plan forward or wait until that plan is fully formed
> and implemented.  The size of this set is rather large.  Therefore, I was
> hoping that a plan would be enough to move forward.

Leave it out for now to focus on the core mechanisms and then we can
circle back to it.

> > That said, the next question, "is DCD enabling considered a toy until
> > the ability to allocate by tag is present?" I think yes, to the point
> > where old daxctl binaries should be made fail to create device instances
> > by forcing a tag to be selected at allocation time for Sparse DAX
> > Regions.
> 
> Interesting.  I was not considering allocate by label to be a requirement
> but rather an enhancement.  Labels IMO are a further refinement of the
> memory space allocation.  I can see a very valid use case (not toy use
> case) where all the DCD memory allocated to a node is dedicated to a
> singular job and is done without tags or even ignoring tags.  Many HPC
> sites run with singular jobs per host.

Is HPC going to use DCD? My impression is that HPC is statically
provisioned per node and that DCD is more targeted at Cloud use cases
where dynamic provisioning is common.

> > The last question is whether *writable* tags are needed to allow for
> > repurposing memory allocated to a host without needing to round trip it
> > through the FM to get it re-tagged. While that is something the host and
> > orchestrator can figure out on their own, it looks like a nice to have
> > until the above questions are answered.
> 
> Needed?  No.  Of course not.  As you said the orchestrator software can
> keep iterating with the FM until it gets what it wants.  It was you who
> had the idea of a writable labels and I agreed.

Yeah, it was an idea for how to solve the problem of repurposing tag
without needing to round trip with the FM.

> "Seemed like a good idea at the time..."  ;-)
> 
> As I have reviewed and rewritten this message I worry that writable labels
> are a bad idea.  Interleaving will most likely depend on grouping extent
> tags into the CXL/DAX extent.  With this in mind adjusting extents is
> potentially going to require an FM interaction to get things set up
> anyway.
> 
> 	[Again re-reading my message I thought of another issue.  What
> 	happens if the user decides to change the label on an extent after
> 	some dax device with the old label?  That seems like an additional
> 	complication which is best left out by not allowing extent labels
> 	to be writable.]

At least for this point extents can not be relabeled while allocated to
an instance.

[..]
> My current view is:
> 	1) No.  Current dax devices can be defined as 'no label'
> 	2) I'm not sure.  I can see both ways having benefits.
> 	3) No I think the ROI is not worth it.
> 	4) The use of 'any extent label' in #2 means that available size
> 	   retains it's meaning for no label dax devices.  Labeled dax
> 	   devices would require a future enhancement to size information.

If the ABI is going to change in the future I don't want every debug
session to start with "which version of daxctl were you using", or "do
your scripts comprehend Sparse DAX Regions?". This stance is motivated
by having seen the problems that the current ABI causes for people that want
to do things like mitigate the "noisy neighbor" phenomenon in memory
side caches. The allocation ABI is too simple and DCD seems to need
more.

The kernel enforced requirement for Sparse DAX Region aware tooling just
makes it easier on us to maintain. If it means waiting until we have
agreement on the allocation ABI I think that's a simple release valve.

The fundamental mechanisms can be reviewed in the meantime.
Ira Weiny Sept. 13, 2023, 7:26 p.m. UTC | #8
Dan Williams wrote:
> Ira Weiny wrote:
> [..]
> > > 
> > > Given that one of the expected DCD use cases is to provide just in time
> > > memory for specific jobs the "first-available" search for free capacity
> > > in a Sparse DAX Region collides with the need to keep allocations
> > > bounded by tag.
> > 
> > How does it collide?
> > 
> > My attempt here is to leave dax devices 'unlabeled'.  As such they will use
> > space on a 'first-available' search regardless of extent labels.
> > 
> > Effectively I have defined 'no label' as being 'any label'.  I apologize
> > for this detail being implicit and not explicit.
> > 
> > My envisioned path would be that older daxctl would continue to work like
> > this because the kernel would not restrict unlabeled dax device creation.
> > 
> > Newer daxctl could use dax device labels to control the extents used.  But
> > only when dax device labeling is introduced in a future kernel.  Use of a
> > newer daxctl on an older DCD kernel could continue to work sans label.
> > 
> > In this way I envisioned a path where the policy is completely dictated by
> > user space restricted only by the software available.
> 
> Tags are a core concept in DCD. "Allocate by tag" does not feel like
> something that can come later at least in terms of when the DCD ABI is
> ready for upstream. So, yes, it can remain out of this patchset, but the
> upstream merge of all of DCD would be gated on that facility arriving.

I don't see how this can be left out of this patchset.  Without dax device
support on DCD there is no functionality and this patchset does nothing.

> 
> > > I agree with Jonathan that unless and until the allocation scheme is
> > > updated to be tag aware then there is no reason for allocate by tag to
> > > exist in the interface.
> > 
> > I will agree that it was perhaps premature to introduce labels on the
> > extents.  However, I did so to give tags a space to be informationally
> > surfaced.
> > 
> > IMO we must have a plan forward or wait until that plan is fully formed
> > and implemented.  The size of this set is rather large.  Therefore, I was
> > hoping that a plan would be enough to move forward.
> 
> Leave it out for now to focus on the core mechanisms and then we can
       ^^^^
       it what?
> circle back to it.

Again dax devices need to be created to fully test this so I have to create
them in some way.  I'm going to assume you mean 'labelless' and deal with
labels later.

> 
> > > That said, the next question, "is DCD enabling considered a toy until
> > > the ability to allocate by tag is present?" I think yes, to the point
> > > where old daxctl binaries should be made fail to create device instances
> > > by forcing a tag to be selected at allocation time for Sparse DAX
> > > Regions.
> > 
> > Interesting.  I was not considering allocate by label to be a requirement
> > but rather an enhancement.  Labels IMO are a further refinement of the
> > memory space allocation.  I can see a very valid use case (not toy use
> > case) where all the DCD memory allocated to a node is dedicated to a
> > singular job and is done without tags or even ignoring tags.  Many HPC
> > sites run with singular jobs per host.
> 
> Is HPC going to use DCD? My impression is that HPC is statically
> provisioned per node and that DCD is more targeted at Cloud use cases
> where dynamic provisioning is common.

I heard someone mention HPC in a call at some point.

> 
> > > The last question is whether *writable* tags are needed to allow for
> > > repurposing memory allocated to a host without needing to round trip it
> > > through the FM to get it re-tagged. While that is something the host and
> > > orchestrator can figure out on their own, it looks like a nice to have
> > > until the above questions are answered.
> > 
> > Needed?  No.  Of course not.  As you said the orchestrator software can
> > keep iterating with the FM until it gets what it wants.  It was you who
> > had the idea of a writable labels and I agreed.
> 
> Yeah, it was an idea for how to solve the problem of repurposing tag
> without needing to round trip with the FM.
> 
> > "Seemed like a good idea at the time..."  ;-)
> > 
> > As I have reviewed and rewritten this message I worry that writable labels
> > are a bad idea.  Interleaving will most likely depend on grouping extent
> > tags into the CXL/DAX extent.  With this in mind adjusting extents is
> > potentially going to require an FM interaction to get things set up
> > anyway.
> > 
> > 	[Again re-reading my message I thought of another issue.  What
> > 	happens if the user decides to change the label on an extent after
> > 	some dax device with the old label?  That seems like an additional
> > 	complication which is best left out by not allowing extent labels
> > 	to be writable.]
> 
> At least for this point extents can not be relabeled while allocated to
> an instance.

Sure but is having writeable labels worth this extra complexity?

> 
> [..]
> > My current view is:
> > 	1) No.  Current dax devices can be defined as 'no label'
> > 	2) I'm not sure.  I can see both ways having benefits.
> > 	3) No I think the ROI is not worth it.
> > 	4) The use of 'any extent label' in #2 means that available size
> > 	   retains it's meaning for no label dax devices.  Labeled dax
> > 	   devices would require a future enhancement to size information.
> 
> If the ABI is going to change in the future I don't want every debug
> session to start with "which version of daxctl were you using", or "do
> your scripts comprehend Sparse DAX Regions?".

Well then we are stuck.  Because at a minimum they will have to understand
Sparse DAX regions.  cxl create-region needs a new type to create such
regions.

I envisioned an ABI *extension* not change.  The current ABI supports dax
devices without a tag.  Even with DCD no tag is possible.  Unless you want
to restrict it, which it sounds like you do?

I'm ok with that but I know of at least 1 meeting where it was
emphatically mentioned that tags are _not_ required.  So I'd like some
community members to chime in here if requiring tags is ok.

>
> This stance is motivated
> by having seen the problems that the current ABI causes for people that want
> to do things like mitigate the "noisy neighbor" phenomenon in memory
> side caches.

Does a dax device need specific placement within the region?  That sounds
like control at the extent layer when the extent is mapped into the
region.

The mapping store interface does need to be resolved for DCD.  I could
envision the ability for user space to create extents...  Are you thinking
the same thing?

Conceptually from a top down approach _any_ dax region could be a sparse
dax region if I get what you are driving at?  Not just DCD?  In that case
extent creation is even more complicated in the DCD case.

> The allocation ABI is too simple and DCD seems to need
> more.

Are you advocating for an ABI which requires dax devices to be labeled?
It sounds like you don't want the current tool set to work on sparse dax
regions.  Is that correct?  I'm ok with that but I don't think a specific
check in the kernel is the proper way to do that.  Current dax devices are
unlabeled.  So I envisioned them being supported with the current ABI.

> 
> The kernel enforced requirement for Sparse DAX Region aware tooling just
> makes it easier on us to maintain. If it means waiting until we ahve
> agreement on the allocation ABI I think that's a simple release valve.

These statements imply to me you have additional requirements for this ABI
beyond what DCD does.  I've tried to make the dax layer DCD/CXL agnostic.
But beyond having the concept of region extents which are labeled and
matched to dax devices based on that label; what other requirements on dax
to region space allocations are there?

> 
> The fundamental mechanisms can be reviewed in the meantime.

Sure,
Ira
Jonathan Cameron Sept. 14, 2023, 10:32 a.m. UTC | #9
On Wed, 13 Sep 2023 12:26:58 -0700
Ira Weiny <ira.weiny@intel.com> wrote:

> Dan Williams wrote:
> > Ira Weiny wrote:

Jumping in on randomly selected points :)

> > [..]  
> > > > 
> > > > Given that one of the expected DCD use cases is to provide just in time
> > > > memory for specific jobs the "first-available" search for free capacity
> > > > in a Sparse DAX Region collides with the need to keep allocations
> > > > bounded by tag.  
> > > 
> > > How does it collide?
> > > 
> > > My attempt here is to leave dax devices 'unlabeled'.  As such they will use
> > > space on a 'first-available' search regardless of extent labels.
> > > 
> > > Effectively I have defined 'no label' as being 'any label'.  I apologize
> > > for this detail being implicit and not explicit.
> > > 
> > > My envisioned path would be that older daxctl would continue to work like
> > > this because the kernel would not restrict unlabeled dax device creation.
> > > 
> > > Newer daxctl could use dax device labels to control the extents used.  But
> > > only when dax device labeling is introduced in a future kernel.  Use of a
> > > newer daxctl on an older DCD kernel could continue to work sans label.
> > > 
> > > In this way I envisioned a path where the policy is completely dictated by
> > > user space restricted only by the software available.  
> > 
> > Tags are a core concept in DCD. "Allocate by tag" does not feel like
> > something that can come later at least in terms of when the DCD ABI is
> > ready for upstream. So, yes, it can remain out of this patchset, but the
> > upstream merge of all of DCD would be gated on that facility arriving.  
> 
> I don't see how this can be left out of this patchset.  Without dax device
> support on DCD there is no functionality and this patchset does nothing.

Agreed - but I think one path you suggest is fine.

No label dax == no label DCD extents.

That one should be true for ever (or until writeable tags added) so
is safe and gets us going.

> 
> >   
> > > > I agree with Jonathan that unless and until the allocation scheme is
> > > > updated to be tag aware then there is no reason for allocate by tag to
> > > > exist in the interface.  
> > > 
> > > I will agree that it was perhaps premature to introduce labels on the
> > > extents.  However, I did so to give tags a space to be informationally
> > > surfaced.
> > > 
> > > IMO we must have a plan forward or wait until that plan is fully formed
> > > and implemented.  The size of this set is rather large.  Therefore, I was
> > > hoping that a plan would be enough to move forward.  
> > 
> > Leave it out for now to focus on the core mechanisms and then we can  
>        ^^^^
>        it what?
> > circle back to it.  
> 
> Again dax devices need to be created to full test this so I have to create
> them in some way.  I'm going to assume you mean 'labelless' and deal with
> labels later.
> 
> >   
> > > > That said, the next question, "is DCD enabling considered a toy until
> > > > the ability to allocate by tag is present?" I think yes, to the point
> > > > where old daxctl binaries should be made fail to create device instances
> > > > by forcing a tag to be selected at allocation time for Sparse DAX
> > > > Regions.  
> > > 
> > > Interesting.  I was not considering allocate by label to be a requirement
> > > but rather an enhancement.  Labels IMO are a further refinement of the
> > > memory space allocation.  I can see a very valid use case (not toy use
> > > case) where all the DCD memory allocated to a node is dedicated to a
> > > singular job and is done without tags or even ignoring tags.  Many HPC
> > > sites run with singular jobs per host.  
> > 
> > Is HPC going to use DCD? My impression is that HPC is statically
> > provisioned per node and that DCD is more targeted at Cloud use cases
> > where dynamic provisioning is common.  
> 
> I heard someone mention HPC in a call at some point.

I'd not rule it out.  Some HPC systems run very mixed workloads in parallel
so would benefit from Dynamic capacity - though maybe not with the same
rate of change as cloud workloads.

> 
> >   
> > > > The last question is whether *writable* tags are needed to allow for
> > > > repurposing memory allocated to a host without needing to round trip it
> > > > through the FM to get it re-tagged. While that is something the host and
> > > > orchestrator can figure out on their own, it looks like a nice to have
> > > > until the above questions are answered.  
> > > 
> > > Needed?  No.  Of course not.  As you said the orchestrator software can
> > > keep iterating with the FM until it gets what it wants.  It was you who
> > > had the idea of a writable labels and I agreed.  
> > 
> > Yeah, it was an idea for how to solve the problem of repurposing tag
> > without needing to round trip with the FM.
> >   
> > > "Seemed like a good idea at the time..."  ;-)
> > > 
> > > As I have reviewed and rewritten this message I worry that writable labels
> > > are a bad idea.  Interleaving will most likely depend on grouping extent
> > > tags into the CXL/DAX extent.  With this in mind adjusting extents is
> > > potentially going to require an FM interaction to get things set up
> > > anyway.
> > > 
> > > 	[Again re-reading my message I thought of another issue.  What
> > > 	happens if the user decides to change the label on an extent after
> > > 	some dax device with the old label?  That seems like an additional
> > > 	complication which is best left out by not allowing extent labels
> > > 	to be writable.]  
> > 
> > At least for this point extents can not be relabeled while allocated to
> > an instance.  
> 
> Sure but is having writeable labels worth this extra complexity?

No. Or not yet anyway.


> 
> > 
> > [..]  
> > > My current view is:
> > > 	1) No.  Current dax devices can be defined as 'no label'
> > > 	2) I'm not sure.  I can see both ways having benefits.

> 2) What does 'no dax device label' mean?
> 		a) any extent label
> 		b) no extent label
(that bit got cropped)

Option b seems something we can support for ever.  Not sure that works
for option a.

> > > 	3) No I think the ROI is not worth it.
> > > 	4) The use of 'any extent label' in #2 means that available size
> > > 	   retains it's meaning for no label dax devices.  Labeled dax
> > > 	   devices would require a future enhancement to size information.  
> > 
> > If the ABI is going to change in the future I don't want every debug
> > session to start with "which version of daxctl were you using", or "do
> > your scripts comprehend Sparse DAX Regions?".  
> 
> Well then we are stuck.  Because at a minimum they will have to understand
> Sparse DAX regions.  cxl create-region needs a new type to create such
> regions.
> 
> I envisioned an ABI *extension* not change.  The current ABI supports dax
> devices without a tag.  Even with DCD no tag is possible.  Unless you want
> to restrict it, which it sounds like you do?
> 
> I'm ok with that but I know of at least 1 meeting where it was
> emphatically mentioned that tags are _not_ required.  So I'd like some
> community members to chime in here if requiring tags is ok.

They are definitely not required and I don't think we want to make
it a Linux requirement that tags are needed.


> 
> >
> > This stance is motivated
> > by having seen the problems that the current ABI causes for people that want
> > to do things like mitigate the "noisy neighbor" phenomenon in memory
> > side caches.  
> 
> Does a dax device need specific placement within the region?  That sounds
> like control at the extent layer when the extent is mapped into the
> region.
> 
> The mapping store interface does need to be resolved for DCD.  I could
> envision the ability for user space to create extents...  Are you thinking
> the same thing?
> 
> Conceptually from a top down approach _any_ dax region could be a sparse
> dax region if I get what you are driving at?  Not just DCD?  In that case
> extent creation is even more complicated in the DCD case.

For now at least I'd push any clever noisy neighbour mess on to the
Fabric manager once we have tags.  Not sure the OS even has the visibility to
do this sort of fine tuning.  We could provide it of course, but that's
a whole level of system description that we don't have today.

> 
> > The allocation ABI is too simple and DCD seems to need
> > more.  
> 
> Are you advocating for an ABI which requires dax devices to be labeled?
> It sounds like you don't want the current tool set to work on sparse dax
> regions.  Is that correct?  I'm ok with that but I don't think a specific
> check in the kernel is the proper way to do that.  Current dax devices are
> unlabled.  So I envisioned them being supported with the current ABI.
> 
> > 
> > The kernel enforced requirement for Sparse DAX Region aware tooling just
> > makes it easier on us to maintain. If it means waiting until we ahve
> > agreement on the allocation ABI I think that's a simple release valve.  
> 
> These statements imply to me you have additional requirements for this ABI
> beyond what DCD does.  I've tried to make the dax layer DCD/CXL agnostic.
> But beyond having the concept of region extents which are labeled and
> matched to dax devices based on that label; what other requirements on dax
> to region space allocations are there?
> 
> > 
> > The fundamental mechanisms can be reviewed in the meantime.  
> 
> Sure,
> Ira
>
diff mbox series

Patch

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index ea7ae82b4687..a9ea6a706702 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -280,6 +280,36 @@  static ssize_t region_align_show(struct device *dev,
 static struct device_attribute dev_attr_region_align =
 		__ATTR(align, 0400, region_align_show, NULL);
 
+#define for_each_extent_resource(extent, res) \
+	for (res = (extent)->child; res; res = res->sibling)
+
+static unsigned long long
+dr_extent_avail_size(struct dax_region_extent *dr_extent)
+{
+	unsigned long long rc;
+	struct resource *res;
+
+	rc = resource_size(dr_extent->res);
+	for_each_extent_resource(dr_extent->res, res)
+		rc -= resource_size(res);
+	return rc;
+}
+
+static int dax_region_add_dynamic_size(struct device *dev, void *data)
+{
+	unsigned long long *size = data, ext_size;
+	struct dax_reg_ext_dev *dr_reg_ext_dev;
+
+	if (!is_dr_ext_dev(dev))
+		return 0;
+
+	dr_reg_ext_dev = to_dr_ext_dev(dev);
+	ext_size = dr_extent_avail_size(dr_reg_ext_dev->dr_extent);
+	dev_dbg(dev, "size %llx\n", ext_size);
+	*size += ext_size;
+	return 0;
+}
+
 #define for_each_dax_region_resource(dax_region, res) \
 	for (res = (dax_region)->res.child; res; res = res->sibling)
 
@@ -290,8 +320,12 @@  static unsigned long long dax_region_avail_size(struct dax_region *dax_region)
 
 	device_lock_assert(dax_region->dev);
 
-	if (is_dynamic(dax_region))
-		return 0;
+	if (is_dynamic(dax_region)) {
+		size = 0;
+		device_for_each_child(dax_region->dev, &size,
+				      dax_region_add_dynamic_size);
+		return size;
+	}
 
 	for_each_dax_region_resource(dax_region, res)
 		size -= resource_size(res);
@@ -421,15 +455,24 @@  EXPORT_SYMBOL_GPL(kill_dev_dax);
 static void trim_dev_dax_range(struct dev_dax *dev_dax)
 {
 	int i = dev_dax->nr_range - 1;
-	struct range *range = &dev_dax->ranges[i].range;
+	struct dev_dax_range *dev_range = &dev_dax->ranges[i];
+	struct range *range = &dev_range->range;
 	struct dax_region *dax_region = dev_dax->region;
+	struct resource *res = &dax_region->res;
 
 	device_lock_assert(dax_region->dev);
 	dev_dbg(&dev_dax->dev, "delete range[%d]: %#llx:%#llx\n", i,
 		(unsigned long long)range->start,
 		(unsigned long long)range->end);
 
-	__release_region(&dax_region->res, range->start, range_len(range));
+	if (dev_range->dr_extent)
+		res = dev_range->dr_extent->res;
+
+	__release_region(res, range->start, range_len(range));
+
+	if (dev_range->dr_extent)
+		dr_extent_put(dev_range->dr_extent);
+
 	if (--dev_dax->nr_range == 0) {
 		kfree(dev_dax->ranges);
 		dev_dax->ranges = NULL;
@@ -818,7 +861,8 @@  static int devm_register_dax_mapping(struct dev_dax *dev_dax, int range_id)
 }
 
 static int alloc_dev_dax_range(struct resource *parent, struct dev_dax *dev_dax,
-			       u64 start, resource_size_t size)
+			       u64 start, resource_size_t size,
+			       struct dax_region_extent *dr_extent)
 {
 	struct dax_region *dax_region = dev_dax->region;
 	struct device *dev = &dev_dax->dev;
@@ -852,12 +896,15 @@  static int alloc_dev_dax_range(struct resource *parent, struct dev_dax *dev_dax,
 	for (i = 0; i < dev_dax->nr_range; i++)
 		pgoff += PHYS_PFN(range_len(&ranges[i].range));
 	dev_dax->ranges = ranges;
+	if (dr_extent)
+		dr_extent_get(dr_extent);
 	ranges[dev_dax->nr_range++] = (struct dev_dax_range) {
 		.pgoff = pgoff,
 		.range = {
 			.start = alloc->start,
 			.end = alloc->end,
 		},
+		.dr_extent = dr_extent,
 	};
 
 	dev_dbg(dev, "alloc range[%d]: %pa:%pa\n", dev_dax->nr_range - 1,
@@ -938,7 +985,8 @@  static int dev_dax_shrink(struct dev_dax *dev_dax, resource_size_t size)
 	int i;
 
 	for (i = dev_dax->nr_range - 1; i >= 0; i--) {
-		struct range *range = &dev_dax->ranges[i].range;
+		struct dev_dax_range *dev_range = &dev_dax->ranges[i];
+		struct range *range = &dev_range->range;
 		struct dax_mapping *mapping = dev_dax->ranges[i].mapping;
 		struct resource *adjust = NULL, *res;
 		resource_size_t shrink;
@@ -954,12 +1002,16 @@  static int dev_dax_shrink(struct dev_dax *dev_dax, resource_size_t size)
 			continue;
 		}
 
-		for_each_dax_region_resource(dax_region, res)
-			if (strcmp(res->name, dev_name(dev)) == 0
-					&& res->start == range->start) {
-				adjust = res;
-				break;
-			}
+		if (dev_range->dr_extent) {
+			adjust = dev_range->dr_extent->res;
+		} else {
+			for_each_dax_region_resource(dax_region, res)
+				if (strcmp(res->name, dev_name(dev)) == 0
+						&& res->start == range->start) {
+					adjust = res;
+					break;
+				}
+		}
 
 		if (dev_WARN_ONCE(dev, !adjust || i != dev_dax->nr_range - 1,
 					"failed to find matching resource\n"))
@@ -973,12 +1025,15 @@  static int dev_dax_shrink(struct dev_dax *dev_dax, resource_size_t size)
 /*
  * Only allow adjustments that preserve the relative pgoff of existing
  * allocations. I.e. the dev_dax->ranges array is ordered by increasing pgoff.
+ * Disallow adjustments on dynamic regions as they can come from all over.
  */
 static bool adjust_ok(struct dev_dax *dev_dax, struct resource *res)
 {
 	struct dev_dax_range *last;
 	int i;
 
+	if (is_dynamic(dev_dax->region))
+		return false;
 	if (dev_dax->nr_range == 0)
 		return false;
 	if (strcmp(res->name, dev_name(&dev_dax->dev)) != 0)
@@ -997,19 +1052,21 @@  static bool adjust_ok(struct dev_dax *dev_dax, struct resource *res)
 }
 
 /*
- * dev_dax_resize_static - Expand the device into the unused portion of the
- * region. This may involve adjusting the end of an existing resource, or
- * allocating a new resource.
+ * __dev_dax_resize - Expand the device into the unused portion of the region.
+ * This may involve adjusting the end of an existing resource, or allocating a
+ * new resource.
  *
  * @parent: parent resource to allocate this range in.
  * @dev_dax: DAX device we are creating this range for
  * @to_alloc: amount of space to alloc; must be <= space available in @parent
+ * @dr_extent: if dynamic; the extent containing parent
  *
  * Return the amount of space allocated or -ERRNO on failure
  */
-static ssize_t dev_dax_resize_static(struct resource *parent,
-				     struct dev_dax *dev_dax,
-				     resource_size_t to_alloc)
+static ssize_t __dev_dax_resize(struct resource *parent,
+				struct dev_dax *dev_dax,
+				resource_size_t to_alloc,
+				struct dax_region_extent *dr_extent)
 {
 	struct resource *res, *first;
 	int rc;
@@ -1017,7 +1074,8 @@  static ssize_t dev_dax_resize_static(struct resource *parent,
 	first = parent->child;
 	if (!first) {
 		rc = alloc_dev_dax_range(parent, dev_dax,
-					   parent->start, to_alloc);
+					   parent->start, to_alloc,
+					   dr_extent);
 		if (rc)
 			return rc;
 		return to_alloc;
@@ -1031,7 +1089,8 @@  static ssize_t dev_dax_resize_static(struct resource *parent,
 		if (res == first && res->start > parent->start) {
 			alloc = min(res->start - parent->start, to_alloc);
 			rc = alloc_dev_dax_range(parent, dev_dax,
-						 parent->start, alloc);
+						 parent->start, alloc,
+						 dr_extent);
 			if (rc)
 				return rc;
 			return alloc;
@@ -1055,7 +1114,8 @@  static ssize_t dev_dax_resize_static(struct resource *parent,
 				return rc;
 			return alloc;
 		}
-		rc = alloc_dev_dax_range(parent, dev_dax, res->end + 1, alloc);
+		rc = alloc_dev_dax_range(parent, dev_dax, res->end + 1, alloc,
+					 dr_extent);
 		if (rc)
 			return rc;
 		return alloc;
@@ -1066,6 +1126,47 @@  static ssize_t dev_dax_resize_static(struct resource *parent,
 	return 0;
 }
 
+/*
+ * dev_dax_resize_static - expand @dev_dax within a static region
+ *
+ * Allocation is done directly against the region's own resource and no
+ * extent is associated with the new range.
+ *
+ * Return the amount of space allocated or -ERRNO on failure
+ */
+static ssize_t dev_dax_resize_static(struct dax_region *dax_region,
+				     struct dev_dax *dev_dax,
+				     resource_size_t to_alloc)
+{
+	struct resource *region_res = &dax_region->res;
+
+	return __dev_dax_resize(region_res, dev_dax, to_alloc, NULL);
+}
+
+/*
+ * device_find_child() match callback: select an extent device which still
+ * has unallocated space.  @data is unused.
+ *
+ * Returns non-zero on a match.  The available size is reduced to a boolean
+ * instead of being returned directly: narrowing the size to int would yield
+ * 0 (no match) for any size whose low 32 bits are clear, e.g. a completely
+ * free 4GiB extent.
+ */
+static int dax_region_find_space(struct device *dev, void *data)
+{
+	struct dax_reg_ext_dev *dr_reg_ext_dev;
+
+	if (!is_dr_ext_dev(dev))
+		return 0;
+
+	dr_reg_ext_dev = to_dr_ext_dev(dev);
+	return dr_extent_avail_size(dr_reg_ext_dev->dr_extent) != 0;
+}
+
+/*
+ * dev_dax_resize_dynamic - expand @dev_dax using space from a single extent
+ *
+ * @dax_region: dynamic region whose child extent devices are searched
+ * @dev_dax: DAX device we are creating this range for
+ * @to_alloc: amount of space requested
+ *
+ * The first extent found with space available is used and the request is
+ * clamped to that extent's available size, so a single call may satisfy
+ * only part of @to_alloc.
+ *
+ * Return the amount of space allocated or -ERRNO on failure
+ */
+static ssize_t dev_dax_resize_dynamic(struct dax_region *dax_region,
+				      struct dev_dax *dev_dax,
+				      resource_size_t to_alloc)
+{
+	struct dax_reg_ext_dev *dr_reg_ext_dev;
+	struct dax_region_extent *dr_extent;
+	resource_size_t extent_max;
+	struct device *dev;
+	/* signed: carries a negative errno from __dev_dax_resize() */
+	ssize_t alloc;
+
+	dev = device_find_child(dax_region->dev, NULL, dax_region_find_space);
+	if (dev_WARN_ONCE(dax_region->dev, !dev,
+			  "Space should be available!\n"))
+		return -ENOSPC;
+	dr_reg_ext_dev = to_dr_ext_dev(dev);
+	dr_extent = dr_reg_ext_dev->dr_extent;
+	extent_max = dr_extent_avail_size(dr_extent);
+	to_alloc = min(extent_max, to_alloc);
+	alloc = __dev_dax_resize(dr_extent->res, dev_dax, to_alloc, dr_extent);
+	/* drop the reference returned by device_find_child() */
+	put_device(dev);
+
+	return alloc;
+}
+
 static ssize_t dev_dax_resize(struct dax_region *dax_region,
 		struct dev_dax *dev_dax, resource_size_t size)
 {
@@ -1089,7 +1190,10 @@  static ssize_t dev_dax_resize(struct dax_region *dax_region,
 		return -ENXIO;
 
 retry:
-	alloc = dev_dax_resize_static(&dax_region->res, dev_dax, to_alloc);
+	if (is_dynamic(dax_region))
+		alloc = dev_dax_resize_dynamic(dax_region, dev_dax, to_alloc);
+	else
+		alloc = dev_dax_resize_static(dax_region, dev_dax, to_alloc);
 	if (alloc <= 0)
 		return alloc;
 	to_alloc -= alloc;
@@ -1168,6 +1272,9 @@  static ssize_t mapping_store(struct device *dev, struct device_attribute *attr,
 	struct range r;
 	ssize_t rc;
 
+	if (is_dynamic(dax_region))
+		return -EINVAL;
+
 	rc = range_parse(buf, len, &r);
 	if (rc)
 		return rc;
@@ -1183,7 +1290,7 @@  static ssize_t mapping_store(struct device *dev, struct device_attribute *attr,
 	to_alloc = range_len(&r);
 	if (alloc_is_aligned(dev_dax, to_alloc))
 		rc = alloc_dev_dax_range(&dax_region->res, dev_dax, r.start,
-					 to_alloc);
+					 to_alloc, NULL);
 	device_unlock(dev);
 	device_unlock(dax_region->dev);
 
@@ -1400,8 +1507,10 @@  struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data)
 	device_initialize(dev);
 	dev_set_name(dev, "dax%d.%d", dax_region->id, dev_dax->id);
 
+	dev_WARN_ONCE(parent, is_dynamic(dax_region) && data->size,
+		      "Dynamic DAX devices are created initially with 0 size");
 	rc = alloc_dev_dax_range(&dax_region->res, dev_dax, dax_region->res.start,
-				 data->size);
+				 data->size, NULL);
 	if (rc)
 		goto err_range;
 
diff --git a/drivers/dax/cxl.c b/drivers/dax/cxl.c
index 44cbd28668f1..6394a3531e25 100644
--- a/drivers/dax/cxl.c
+++ b/drivers/dax/cxl.c
@@ -12,6 +12,17 @@  static void dax_reg_ext_get(struct dax_region_extent *dr_extent)
 	kref_get(&dr_extent->ref);
 }
 
+
+/*
+ * Release the resource cached in @dr_extent from the resource tree of the
+ * region cached in @dr_extent.
+ */
+static void dax_region_rm_resource(struct dax_region_extent *dr_extent)
+{
+	struct resource *ext_res = dr_extent->res;
+	struct dax_region *dax_region = dr_extent->region;
+
+	dev_dbg(dax_region->dev, "Extent release resource %pR\n", ext_res);
+	__release_region(&dax_region->res, ext_res->start,
+			 resource_size(ext_res));
+}
+
 static void dr_release(struct kref *kref)
 {
 	struct dax_region_extent *dr_extent;
@@ -19,6 +30,7 @@  static void dr_release(struct kref *kref)
 
 	dr_extent = container_of(kref, struct dax_region_extent, ref);
 	cxl_dr_ext = dr_extent->private_data;
+	dax_region_rm_resource(dr_extent);
 	cxl_dr_extent_put(cxl_dr_ext);
 	kfree(dr_extent);
 }
@@ -28,6 +40,29 @@  static void dax_reg_ext_put(struct dax_region_extent *dr_extent)
 	kref_put(&dr_extent->ref, dr_release);
 }
 
+/*
+ * dax_region_add_resource - request an extent's span in the region resource
+ *
+ * @dax_region: region the extent is added to
+ * @dr_extent: extent object; ->region and ->res are filled in on success
+ * @offset: offset of the extent from the start of the region resource
+ * @length: length of the extent
+ *
+ * Return 0 on success or -ENOSPC if the span could not be requested
+ */
+static int dax_region_add_resource(struct dax_region *dax_region,
+				   struct dax_region_extent *dr_extent,
+				   resource_size_t offset,
+				   resource_size_t length)
+{
+	resource_size_t start = dax_region->res.start + offset;
+	struct resource *ext_res;
+
+	dev_dbg(dax_region->dev, "DAX region resource %pR\n", &dax_region->res);
+	ext_res = __request_region(&dax_region->res, start, length, "extent", 0);
+	if (!ext_res) {
+		/*
+		 * resource_size_t is not always 64 bits wide, so print it
+		 * with %pa (passed by reference) rather than %llx.
+		 */
+		dev_err(dax_region->dev, "Failed to add extent s:%pa l:%pa\n",
+			&start, &length);
+		return -ENOSPC;
+	}
+
+	dr_extent->region = dax_region;
+	dr_extent->res = ext_res;
+	dev_dbg(dax_region->dev, "Extent add resource %pR\n", ext_res);
+
+	return 0;
+}
+
 static int cxl_dax_region_create_extent(struct dax_region *dax_region,
 					struct cxl_dr_extent *cxl_dr_ext)
 {
@@ -45,11 +80,20 @@  static int cxl_dax_region_create_extent(struct dax_region *dax_region,
 	/* device manages the dr_extent on success */
 	kref_init(&dr_extent->ref);
 
+	rc = dax_region_add_resource(dax_region, dr_extent,
+				     cxl_dr_ext->hpa_offset,
+				     cxl_dr_ext->hpa_length);
+	if (rc) {
+		kfree(dr_extent);
+		return rc;
+	}
+
 	rc = dax_region_ext_create_dev(dax_region, dr_extent,
 				       cxl_dr_ext->hpa_offset,
 				       cxl_dr_ext->hpa_length,
 				       cxl_dr_ext->label);
 	if (rc) {
+		dax_region_rm_resource(dr_extent);
 		kfree(dr_extent);
 		return rc;
 	}
diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
index 250babd6e470..ad73b53aa802 100644
--- a/drivers/dax/dax-private.h
+++ b/drivers/dax/dax-private.h
@@ -44,12 +44,16 @@  struct dax_region {
 /*
  * struct dax_region_extent - extent data defined by the low level region
  * driver.
+ * @region: cache of dax_region
+ * @res: cache of resource tree for this extent
  * @private_data: lower level region driver data
  * @ref: track number of dax devices which are using this extent
  * @get: get reference to low level data
  * @put: put reference to low level data
  */
 struct dax_region_extent {
+	struct dax_region *region;
+	struct resource *res;
 	void *private_data;
 	struct kref ref;
 	void (*get)(struct dax_region_extent *dr_extent);
@@ -131,6 +135,7 @@  struct dev_dax {
 		unsigned long pgoff;
 		struct range range;
 		struct dax_mapping *mapping;
+		struct dax_region_extent *dr_extent;
 	} *ranges;
 };