diff mbox series

[36/46] cxl/region: Add interleave ways attribute

Message ID 20220624041950.559155-11-dan.j.williams@intel.com (mailing list archive)
State Superseded
Headers show
Series CXL PMEM Region Provisioning | expand

Commit Message

Dan Williams June 24, 2022, 4:19 a.m. UTC
From: Ben Widawsky <bwidawsk@kernel.org>

Add an ABI to allow the number of devices that comprise a region to be
set.

Signed-off-by: Ben Widawsky <bwidawsk@kernel.org>
[djbw: reword changelog]
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 Documentation/ABI/testing/sysfs-bus-cxl |  21 ++++
 drivers/cxl/core/region.c               | 128 ++++++++++++++++++++++++
 drivers/cxl/cxl.h                       |  33 ++++++
 3 files changed, 182 insertions(+)

Comments

Jonathan Cameron June 30, 2022, 1:44 p.m. UTC | #1
On Thu, 23 Jun 2022 21:19:40 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> From: Ben Widawsky <bwidawsk@kernel.org>
> 
> Add an ABI to allow the number of devices that comprise a region to be
> set.

Should at least mention interleave_granularity is being added as well!

> 
> Signed-off-by: Ben Widawsky <bwidawsk@kernel.org>
> [djbw: reword changelog]
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>

Random diversion inline...

> ---
>  Documentation/ABI/testing/sysfs-bus-cxl |  21 ++++
>  drivers/cxl/core/region.c               | 128 ++++++++++++++++++++++++
>  drivers/cxl/cxl.h                       |  33 ++++++
>  3 files changed, 182 insertions(+)

> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index f75978f846b9..78af42454760 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -7,6 +7,7 @@


> +static ssize_t interleave_granularity_store(struct device *dev,
> +					    struct device_attribute *attr,
> +					    const char *buf, size_t len)
> +{
> +	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent);
> +	struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
> +	struct cxl_region *cxlr = to_cxl_region(dev);
> +	struct cxl_region_params *p = &cxlr->params;
> +	int rc, val;
> +	u16 ig;
> +
> +	rc = kstrtoint(buf, 0, &val);
> +	if (rc)
> +		return rc;
> +
> +	rc = granularity_to_cxl(val, &ig);
> +	if (rc)
> +		return rc;
> +
> +	/* region granularity must be >= root granularity */

In general I think that's an implementation choice.  Sure today
we only support it this way, but it's perfectly possible to build
setups where that's not the case.  Maybe the comment should say
that this code goes with an implementation choice inline with
the software guide (that argues you will always prefer small
ig for interleaving at the host to make best use of bandwidth etc).

Interestingly the code I was previously testing QEMU with
allowed that option (might have been only option that worked).
That code was a mixture of Ben's earlier version and my own hacks.
It probably doesn't make sense to support other ways of picking
the interleaving granularity until / if we ever get a request
to do so. 

I think it results in a different device ordering.

Ordering with this

    Host
     | 4k
    / \
   /   \  
  HB   HB  8k
  |     |
 / \   / \
0  2   1  3

Ordering with Larger granularity CFMWS over finer granularity HB

    Host
     | 8k
    / \
   /   \ 
  HB   HB 4k
  |     |
 / \   / \
0  1   2  3

Not clear why you'd do the second one though :)  So can ignore for now.


> +	if (val < cxld->interleave_granularity)
> +		return -EINVAL;
> +
> +	rc = down_write_killable(&cxl_region_rwsem);
> +	if (rc)
> +		return rc;
> +	if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) {
> +		rc = -EBUSY;
> +		goto out;
> +	}
> +
> +	p->interleave_granularity = val;
> +out:
> +	up_read(&cxl_region_rwsem);
> +	if (rc)
> +		return rc;
> +	return len;
> +}
> +static DEVICE_ATTR_RW(interleave_granularity);
> +
>  static struct attribute *cxl_region_attrs[] = {
>  	&dev_attr_uuid.attr,
> +	&dev_attr_interleave_ways.attr,
> +	&dev_attr_interleave_granularity.attr,
>  	NULL,
>  };
>
Jonathan Cameron June 30, 2022, 1:45 p.m. UTC | #2
On Thu, 23 Jun 2022 21:19:40 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> From: Ben Widawsky <bwidawsk@kernel.org>
> 
> Add an ABI to allow the number of devices that comprise a region to be
> set.
> 
> Signed-off-by: Ben Widawsky <bwidawsk@kernel.org>
> [djbw: reword changelog]
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Forgot to say that with mention of the granularity in the patch
description I'm fine with this rest of this.

Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>

> ---
>  Documentation/ABI/testing/sysfs-bus-cxl |  21 ++++
>  drivers/cxl/core/region.c               | 128 ++++++++++++++++++++++++
>  drivers/cxl/cxl.h                       |  33 ++++++
>  3 files changed, 182 insertions(+)
> 
> diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
> index d30c95a758a9..46d5295c1149 100644
> --- a/Documentation/ABI/testing/sysfs-bus-cxl
> +++ b/Documentation/ABI/testing/sysfs-bus-cxl
> @@ -273,3 +273,24 @@ Description:
>  		(RW) Write a unique identifier for the region. This field must
>  		be set for persistent regions and it must not conflict with the
>  		UUID of another region.
> +
> +
> +What:		/sys/bus/cxl/devices/regionZ/interleave_granularity
> +Date:		May, 2022
> +KernelVersion:	v5.20
> +Contact:	linux-cxl@vger.kernel.org
> +Description:
> +		(RW) Set the number of consecutive bytes each device in the
> +		interleave set will claim. The possible interleave granularity
> +		values are determined by the CXL spec and the participating
> +		devices.
> +
> +
> +What:		/sys/bus/cxl/devices/regionZ/interleave_ways
> +Date:		May, 2022
> +KernelVersion:	v5.20
> +Contact:	linux-cxl@vger.kernel.org
> +Description:
> +		(RW) Configures the number of devices participating in the
> +		region is set by writing this value. Each device will provide
> +		1/interleave_ways of storage for the region.
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index f75978f846b9..78af42454760 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -7,6 +7,7 @@
>  #include <linux/slab.h>
>  #include <linux/uuid.h>
>  #include <linux/idr.h>
> +#include <cxlmem.h>
>  #include <cxl.h>
>  #include "core.h"
>  
> @@ -21,6 +22,8 @@
>   *
>   * Region configuration has ordering constraints. UUID may be set at any time
>   * but is only visible for persistent regions.
> + * 1. Interleave granularity
> + * 2. Interleave size
>   */
>  
>  /*
> @@ -119,8 +122,129 @@ static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a,
>  	return a->mode;
>  }
>  
> +static ssize_t interleave_ways_show(struct device *dev,
> +				    struct device_attribute *attr, char *buf)
> +{
> +	struct cxl_region *cxlr = to_cxl_region(dev);
> +	struct cxl_region_params *p = &cxlr->params;
> +	ssize_t rc;
> +
> +	rc = down_read_interruptible(&cxl_region_rwsem);
> +	if (rc)
> +		return rc;
> +	rc = sysfs_emit(buf, "%d\n", p->interleave_ways);
> +	up_read(&cxl_region_rwsem);
> +
> +	return rc;
> +}
> +
> +static ssize_t interleave_ways_store(struct device *dev,
> +				     struct device_attribute *attr,
> +				     const char *buf, size_t len)
> +{
> +	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent);
> +	struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
> +	struct cxl_region *cxlr = to_cxl_region(dev);
> +	struct cxl_region_params *p = &cxlr->params;
> +	int rc, val;
> +	u8 iw;
> +
> +	rc = kstrtoint(buf, 0, &val);
> +	if (rc)
> +		return rc;
> +
> +	rc = ways_to_cxl(val, &iw);
> +	if (rc)
> +		return rc;
> +
> +	/*
> +	 * Even for x3, x9, and x12 interleaves the region interleave must be a
> +	 * power of 2 multiple of the host bridge interleave.
> +	 */
> +	if (!is_power_of_2(val / cxld->interleave_ways) ||
> +	    (val % cxld->interleave_ways)) {
> +		dev_dbg(&cxlr->dev, "invalid interleave: %d\n", val);
> +		return -EINVAL;
> +	}
> +
> +	rc = down_write_killable(&cxl_region_rwsem);
> +	if (rc)
> +		return rc;
> +	if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) {
> +		rc = -EBUSY;
> +		goto out;
> +	}
> +
> +	p->interleave_ways = val;
> +out:
> +	up_read(&cxl_region_rwsem);
> +	if (rc)
> +		return rc;
> +	return len;
> +}
> +static DEVICE_ATTR_RW(interleave_ways);
> +
> +static ssize_t interleave_granularity_show(struct device *dev,
> +					   struct device_attribute *attr,
> +					   char *buf)
> +{
> +	struct cxl_region *cxlr = to_cxl_region(dev);
> +	struct cxl_region_params *p = &cxlr->params;
> +	ssize_t rc;
> +
> +	rc = down_read_interruptible(&cxl_region_rwsem);
> +	if (rc)
> +		return rc;
> +	rc = sysfs_emit(buf, "%d\n", p->interleave_granularity);
> +	up_read(&cxl_region_rwsem);
> +
> +	return rc;
> +}
> +
> +static ssize_t interleave_granularity_store(struct device *dev,
> +					    struct device_attribute *attr,
> +					    const char *buf, size_t len)
> +{
> +	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent);
> +	struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
> +	struct cxl_region *cxlr = to_cxl_region(dev);
> +	struct cxl_region_params *p = &cxlr->params;
> +	int rc, val;
> +	u16 ig;
> +
> +	rc = kstrtoint(buf, 0, &val);
> +	if (rc)
> +		return rc;
> +
> +	rc = granularity_to_cxl(val, &ig);
> +	if (rc)
> +		return rc;
> +
> +	/* region granularity must be >= root granularity */
> +	if (val < cxld->interleave_granularity)
> +		return -EINVAL;
> +
> +	rc = down_write_killable(&cxl_region_rwsem);
> +	if (rc)
> +		return rc;
> +	if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) {
> +		rc = -EBUSY;
> +		goto out;
> +	}
> +
> +	p->interleave_granularity = val;
> +out:
> +	up_read(&cxl_region_rwsem);
> +	if (rc)
> +		return rc;
> +	return len;
> +}
> +static DEVICE_ATTR_RW(interleave_granularity);
> +
>  static struct attribute *cxl_region_attrs[] = {
>  	&dev_attr_uuid.attr,
> +	&dev_attr_interleave_ways.attr,
> +	&dev_attr_interleave_granularity.attr,
>  	NULL,
>  };
>  
> @@ -212,6 +336,8 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
>  					      enum cxl_decoder_type type)
>  {
>  	struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent);
> +	struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
> +	struct cxl_region_params *p;
>  	struct cxl_region *cxlr;
>  	struct device *dev;
>  	int rc;
> @@ -219,8 +345,10 @@ static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
>  	cxlr = cxl_region_alloc(cxlrd, id);
>  	if (IS_ERR(cxlr))
>  		return cxlr;
> +	p = &cxlr->params;
>  	cxlr->mode = mode;
>  	cxlr->type = type;
> +	p->interleave_granularity = cxld->interleave_granularity;
>  
>  	dev = &cxlr->dev;
>  	rc = dev_set_name(dev, "region%d", id);
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 46a9f8acc602..13ee04b00e0c 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -7,6 +7,7 @@
>  #include <linux/libnvdimm.h>
>  #include <linux/bitfield.h>
>  #include <linux/bitops.h>
> +#include <linux/log2.h>
>  #include <linux/io.h>
>  
>  /**
> @@ -92,6 +93,31 @@ static inline int cxl_to_ways(u8 eniw, unsigned int *val)
>  	return 0;
>  }
>  
> +static inline int granularity_to_cxl(int g, u16 *ig)
> +{
> +	if (g > SZ_16K || g < 256 || !is_power_of_2(g))
> +		return -EINVAL;
> +	*ig = ilog2(g) - 8;
> +	return 0;
> +}
> +
> +static inline int ways_to_cxl(int ways, u8 *iw)
> +{
> +	if (ways > 16)
> +		return -EINVAL;
> +	if (is_power_of_2(ways)) {
> +		*iw = ilog2(ways);
> +		return 0;
> +	}
> +	if (ways % 3)
> +		return -EINVAL;
> +	ways /= 3;
> +	if (!is_power_of_2(ways))
> +		return -EINVAL;
> +	*iw = ilog2(ways) + 8;
> +	return 0;
> +}
> +
>  /* CXL 2.0 8.2.8.1 Device Capabilities Array Register */
>  #define CXLDEV_CAP_ARRAY_OFFSET 0x0
>  #define   CXLDEV_CAP_ARRAY_CAP_ID 0
> @@ -291,11 +317,14 @@ struct cxl_root_decoder {
>  /*
>   * enum cxl_config_state - State machine for region configuration
>   * @CXL_CONFIG_IDLE: Any sysfs attribute can be written freely
> + * @CXL_CONFIG_INTERLEAVE_ACTIVE: region size has been set, no more
> + * changes to interleave_ways or interleave_granularity
>   * @CXL_CONFIG_ACTIVE: All targets have been added the region is now
>   * active
>   */
>  enum cxl_config_state {
>  	CXL_CONFIG_IDLE,
> +	CXL_CONFIG_INTERLEAVE_ACTIVE,
>  	CXL_CONFIG_ACTIVE,
>  };
>  
> @@ -303,12 +332,16 @@ enum cxl_config_state {
>   * struct cxl_region_params - region settings
>   * @state: allow the driver to lockdown further parameter changes
>   * @uuid: unique id for persistent regions
> + * @interleave_ways: number of endpoints in the region
> + * @interleave_granularity: capacity each endpoint contributes to a stripe
>   *
>   * State transitions are protected by the cxl_region_rwsem
>   */
>  struct cxl_region_params {
>  	enum cxl_config_state state;
>  	uuid_t uuid;
> +	int interleave_ways;
> +	int interleave_granularity;
>  };
>  
>  /**
Dan Williams July 11, 2022, 12:32 a.m. UTC | #3
Jonathan Cameron wrote:
> On Thu, 23 Jun 2022 21:19:40 -0700
> Dan Williams <dan.j.williams@intel.com> wrote:
> 
> > From: Ben Widawsky <bwidawsk@kernel.org>
> > 
> > Add an ABI to allow the number of devices that comprise a region to be
> > set.
> 
> Should at least mention interleave_granularity is being added as well!

Added.

> 
> > 
> > Signed-off-by: Ben Widawsky <bwidawsk@kernel.org>
> > [djbw: reword changelog]
> > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> 
> Random diversion inline...
> 
> > ---
> >  Documentation/ABI/testing/sysfs-bus-cxl |  21 ++++
> >  drivers/cxl/core/region.c               | 128 ++++++++++++++++++++++++
> >  drivers/cxl/cxl.h                       |  33 ++++++
> >  3 files changed, 182 insertions(+)
> 
> > diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> > index f75978f846b9..78af42454760 100644
> > --- a/drivers/cxl/core/region.c
> > +++ b/drivers/cxl/core/region.c
> > @@ -7,6 +7,7 @@
> 
> 
> > +static ssize_t interleave_granularity_store(struct device *dev,
> > +					    struct device_attribute *attr,
> > +					    const char *buf, size_t len)
> > +{
> > +	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent);
> > +	struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
> > +	struct cxl_region *cxlr = to_cxl_region(dev);
> > +	struct cxl_region_params *p = &cxlr->params;
> > +	int rc, val;
> > +	u16 ig;
> > +
> > +	rc = kstrtoint(buf, 0, &val);
> > +	if (rc)
> > +		return rc;
> > +
> > +	rc = granularity_to_cxl(val, &ig);
> > +	if (rc)
> > +		return rc;
> > +
> > +	/* region granularity must be >= root granularity */
> 
> In general I think that's an implementation choice.  Sure today
> we only support it this way, but it's perfectly possible to build
> setups where that's not the case.

If the region granularity is smaller than the host bridge interleave
granularity it means that multiple devices per host bridge are needed to
satsify a single "slot" in the interleave. Valid? Yes. Useful for Linux
to support, not clear.

> Maybe the comment should say that this code goes with an
> implementation choice inline with the software guide (that argues you
> will always prefer small ig for interleaving at the host to make best
> use of bandwidth etc).

No, I would prefer that as far as the Linux implementation is concerned
the software-guide does not exist. In the sense that the Linux
implementation choices supersede and are otherwise a superset of what
the guide recommends.

Also, for the same reason that the code does not anticipate future
implementation possibilities, neither should the comments. It is
sufficient to just change this comment when / if the implemetation stops
expecting region granularity >= root granularity.

> Interestingly the code I was previously testing QEMU with
> allowed that option (might have been only option that worked).
> That code was a mixture of Ben's earlier version and my own hacks.
> It probably doesn't make sense to support other ways of picking
> the interleaving granularity until / if we ever get a request
> to do so. 
> 
> I think it results in a different device ordering.
> 
> Ordering with this
> 
>     Host
>      | 4k
>     / \
>    /   \  
>   HB   HB  8k
>   |     |
>  / \   / \
> 0  2   1  3
> 
> Ordering with Larger granularity CFMWS over finer granularity HB
> 
>     Host
>      | 8k
>     / \
>    /   \ 
>   HB   HB 4k
>   |     |
>  / \   / \
> 0  1   2  3
> 
> Not clear why you'd do the second one though :)  So can ignore for now.

All I can think of is "ZOMG! My platform failed and the only one I have
to recover my data has HB interleaves with larger granularity than my
failed system!". Otherwise, I expect cross-platform CXL persistent
memory recovery to be so rare as to not need to spend time too much time
worrying about it now. It seems a straightforward constraint to lift at
a later date without any risk to breaking the ABI.
Jonathan Cameron July 19, 2022, 2:47 p.m. UTC | #4
On Sun, 10 Jul 2022 17:32:26 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> Jonathan Cameron wrote:
> > On Thu, 23 Jun 2022 21:19:40 -0700
> > Dan Williams <dan.j.williams@intel.com> wrote:
> >   
> > > From: Ben Widawsky <bwidawsk@kernel.org>
> > > 
> > > Add an ABI to allow the number of devices that comprise a region to be
> > > set.  
> > 
> > Should at least mention interleave_granularity is being added as well!  
> 
> Added.
> 
> >   
> > > 
> > > Signed-off-by: Ben Widawsky <bwidawsk@kernel.org>
> > > [djbw: reword changelog]
> > > Signed-off-by: Dan Williams <dan.j.williams@intel.com>  
> > 
> > Random diversion inline...
> >   
> > > ---
> > >  Documentation/ABI/testing/sysfs-bus-cxl |  21 ++++
> > >  drivers/cxl/core/region.c               | 128 ++++++++++++++++++++++++
> > >  drivers/cxl/cxl.h                       |  33 ++++++
> > >  3 files changed, 182 insertions(+)  
> >   
> > > diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> > > index f75978f846b9..78af42454760 100644
> > > --- a/drivers/cxl/core/region.c
> > > +++ b/drivers/cxl/core/region.c
> > > @@ -7,6 +7,7 @@  
> > 
> >   
> > > +static ssize_t interleave_granularity_store(struct device *dev,
> > > +					    struct device_attribute *attr,
> > > +					    const char *buf, size_t len)
> > > +{
> > > +	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent);
> > > +	struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
> > > +	struct cxl_region *cxlr = to_cxl_region(dev);
> > > +	struct cxl_region_params *p = &cxlr->params;
> > > +	int rc, val;
> > > +	u16 ig;
> > > +
> > > +	rc = kstrtoint(buf, 0, &val);
> > > +	if (rc)
> > > +		return rc;
> > > +
> > > +	rc = granularity_to_cxl(val, &ig);
> > > +	if (rc)
> > > +		return rc;
> > > +
> > > +	/* region granularity must be >= root granularity */  
> > 
> > In general I think that's an implementation choice.  Sure today
> > we only support it this way, but it's perfectly possible to build
> > setups where that's not the case.  
> 
> If the region granularity is smaller than the host bridge interleave
> granularity it means that multiple devices per host bridge are needed to
> satsify a single "slot" in the interleave. Valid? Yes. Useful for Linux
> to support, not clear.

True.  Wait and see on this one makes sense to me. I only noticed because
my older test scripts (against hacks on top of Ben's code) were broken as
I did it the silly way :)

> 
> > Maybe the comment should say that this code goes with an
> > implementation choice inline with the software guide (that argues you
> > will always prefer small ig for interleaving at the host to make best
> > use of bandwidth etc).  
> 
> No, I would prefer that as far as the Linux implementation is concerned
> the software-guide does not exist. In the sense that the Linux
> implementation choices supersede and are otherwise a superset of what
> the guide recommends.

ah. I phrased that badly. I just meant lift the argument as a comment rather
than a cross reference.

> 
> Also, for the same reason that the code does not anticipate future
> implementation possibilities, neither should the comments. It is
> sufficient to just change this comment when / if the implemetation stops
> expecting region granularity >= root granularity.
> 
> > Interestingly the code I was previously testing QEMU with
> > allowed that option (might have been only option that worked).
> > That code was a mixture of Ben's earlier version and my own hacks.
> > It probably doesn't make sense to support other ways of picking
> > the interleaving granularity until / if we ever get a request
> > to do so. 
> > 
> > I think it results in a different device ordering.
> > 
> > Ordering with this
> > 
> >     Host
> >      | 4k
> >     / \
> >    /   \  
> >   HB   HB  8k
> >   |     |
> >  / \   / \
> > 0  2   1  3
> > 
> > Ordering with Larger granularity CFMWS over finer granularity HB
> > 
> >     Host
> >      | 8k
> >     / \
> >    /   \ 
> >   HB   HB 4k
> >   |     |
> >  / \   / \
> > 0  1   2  3
> > 
> > Not clear why you'd do the second one though :)  So can ignore for now.  
> 
> All I can think of is "ZOMG! My platform failed and the only one I have
> to recover my data has HB interleaves with larger granularity than my
> failed system!". Otherwise, I expect cross-platform CXL persistent
> memory recovery to be so rare as to not need to spend time too much time
> worrying about it now. It seems a straightforward constraint to lift at
> a later date without any risk to breaking the ABI.

It was cross platform that I was thinking but you make a fair point that
it is unlikely to occur that often.  + If another OS want's to do it wrong
that's their problem :)

J
Dan Williams July 19, 2022, 10:15 p.m. UTC | #5
Jonathan Cameron wrote:
> > No, I would prefer that as far as the Linux implementation is concerned
> > the software-guide does not exist. In the sense that the Linux
> > implementation choices supersede and are otherwise a superset of what
> > the guide recommends.
> 
> ah. I phrased that badly. I just meant lift the argument as a comment rather
> than a cross reference.

Oh, you mean promote it to an actual rationale comment rather than just
parrot what the code is doing? Yeah, that's a good idea.
Jonathan Cameron July 20, 2022, 9:59 a.m. UTC | #6
On Tue, 19 Jul 2022 15:15:16 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> Jonathan Cameron wrote:
> > > No, I would prefer that as far as the Linux implementation is concerned
> > > the software-guide does not exist. In the sense that the Linux
> > > implementation choices supersede and are otherwise a superset of what
> > > the guide recommends.  
> > 
> > ah. I phrased that badly. I just meant lift the argument as a comment rather
> > than a cross reference.  
> 
> Oh, you mean promote it to an actual rationale comment rather than just
> parrot what the code is doing? Yeah, that's a good idea.

yup
diff mbox series

Patch

diff --git a/Documentation/ABI/testing/sysfs-bus-cxl b/Documentation/ABI/testing/sysfs-bus-cxl
index d30c95a758a9..46d5295c1149 100644
--- a/Documentation/ABI/testing/sysfs-bus-cxl
+++ b/Documentation/ABI/testing/sysfs-bus-cxl
@@ -273,3 +273,24 @@  Description:
 		(RW) Write a unique identifier for the region. This field must
 		be set for persistent regions and it must not conflict with the
 		UUID of another region.
+
+
+What:		/sys/bus/cxl/devices/regionZ/interleave_granularity
+Date:		May, 2022
+KernelVersion:	v5.20
+Contact:	linux-cxl@vger.kernel.org
+Description:
+		(RW) Set the number of consecutive bytes each device in the
+		interleave set will claim. The possible interleave granularity
+		values are determined by the CXL spec and the participating
+		devices.
+
+
+What:		/sys/bus/cxl/devices/regionZ/interleave_ways
+Date:		May, 2022
+KernelVersion:	v5.20
+Contact:	linux-cxl@vger.kernel.org
+Description:
+		(RW) Configures the number of devices participating in the
+		region is set by writing this value. Each device will provide
+		1/interleave_ways of storage for the region.
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index f75978f846b9..78af42454760 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -7,6 +7,7 @@ 
 #include <linux/slab.h>
 #include <linux/uuid.h>
 #include <linux/idr.h>
+#include <cxlmem.h>
 #include <cxl.h>
 #include "core.h"
 
@@ -21,6 +22,8 @@ 
  *
  * Region configuration has ordering constraints. UUID may be set at any time
  * but is only visible for persistent regions.
+ * 1. Interleave granularity
+ * 2. Interleave size
  */
 
 /*
@@ -119,8 +122,129 @@  static umode_t cxl_region_visible(struct kobject *kobj, struct attribute *a,
 	return a->mode;
 }
 
+static ssize_t interleave_ways_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct cxl_region *cxlr = to_cxl_region(dev);
+	struct cxl_region_params *p = &cxlr->params;
+	ssize_t rc;
+
+	rc = down_read_interruptible(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+	rc = sysfs_emit(buf, "%d\n", p->interleave_ways);
+	up_read(&cxl_region_rwsem);
+
+	return rc;
+}
+
+static ssize_t interleave_ways_store(struct device *dev,
+				     struct device_attribute *attr,
+				     const char *buf, size_t len)
+{
+	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent);
+	struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
+	struct cxl_region *cxlr = to_cxl_region(dev);
+	struct cxl_region_params *p = &cxlr->params;
+	int rc, val;
+	u8 iw;
+
+	rc = kstrtoint(buf, 0, &val);
+	if (rc)
+		return rc;
+
+	rc = ways_to_cxl(val, &iw);
+	if (rc)
+		return rc;
+
+	/*
+	 * Even for x3, x9, and x12 interleaves the region interleave must be a
+	 * power of 2 multiple of the host bridge interleave.
+	 */
+	if (!is_power_of_2(val / cxld->interleave_ways) ||
+	    (val % cxld->interleave_ways)) {
+		dev_dbg(&cxlr->dev, "invalid interleave: %d\n", val);
+		return -EINVAL;
+	}
+
+	rc = down_write_killable(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+	if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) {
+		rc = -EBUSY;
+		goto out;
+	}
+
+	p->interleave_ways = val;
+out:
+	up_read(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+	return len;
+}
+static DEVICE_ATTR_RW(interleave_ways);
+
+static ssize_t interleave_granularity_show(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	struct cxl_region *cxlr = to_cxl_region(dev);
+	struct cxl_region_params *p = &cxlr->params;
+	ssize_t rc;
+
+	rc = down_read_interruptible(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+	rc = sysfs_emit(buf, "%d\n", p->interleave_granularity);
+	up_read(&cxl_region_rwsem);
+
+	return rc;
+}
+
+static ssize_t interleave_granularity_store(struct device *dev,
+					    struct device_attribute *attr,
+					    const char *buf, size_t len)
+{
+	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev->parent);
+	struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
+	struct cxl_region *cxlr = to_cxl_region(dev);
+	struct cxl_region_params *p = &cxlr->params;
+	int rc, val;
+	u16 ig;
+
+	rc = kstrtoint(buf, 0, &val);
+	if (rc)
+		return rc;
+
+	rc = granularity_to_cxl(val, &ig);
+	if (rc)
+		return rc;
+
+	/* region granularity must be >= root granularity */
+	if (val < cxld->interleave_granularity)
+		return -EINVAL;
+
+	rc = down_write_killable(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+	if (p->state >= CXL_CONFIG_INTERLEAVE_ACTIVE) {
+		rc = -EBUSY;
+		goto out;
+	}
+
+	p->interleave_granularity = val;
+out:
+	up_read(&cxl_region_rwsem);
+	if (rc)
+		return rc;
+	return len;
+}
+static DEVICE_ATTR_RW(interleave_granularity);
+
 static struct attribute *cxl_region_attrs[] = {
 	&dev_attr_uuid.attr,
+	&dev_attr_interleave_ways.attr,
+	&dev_attr_interleave_granularity.attr,
 	NULL,
 };
 
@@ -212,6 +336,8 @@  static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
 					      enum cxl_decoder_type type)
 {
 	struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent);
+	struct cxl_decoder *cxld = &cxlrd->cxlsd.cxld;
+	struct cxl_region_params *p;
 	struct cxl_region *cxlr;
 	struct device *dev;
 	int rc;
@@ -219,8 +345,10 @@  static struct cxl_region *devm_cxl_add_region(struct cxl_root_decoder *cxlrd,
 	cxlr = cxl_region_alloc(cxlrd, id);
 	if (IS_ERR(cxlr))
 		return cxlr;
+	p = &cxlr->params;
 	cxlr->mode = mode;
 	cxlr->type = type;
+	p->interleave_granularity = cxld->interleave_granularity;
 
 	dev = &cxlr->dev;
 	rc = dev_set_name(dev, "region%d", id);
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 46a9f8acc602..13ee04b00e0c 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -7,6 +7,7 @@ 
 #include <linux/libnvdimm.h>
 #include <linux/bitfield.h>
 #include <linux/bitops.h>
+#include <linux/log2.h>
 #include <linux/io.h>
 
 /**
@@ -92,6 +93,31 @@  static inline int cxl_to_ways(u8 eniw, unsigned int *val)
 	return 0;
 }
 
+static inline int granularity_to_cxl(int g, u16 *ig)
+{
+	if (g > SZ_16K || g < 256 || !is_power_of_2(g))
+		return -EINVAL;
+	*ig = ilog2(g) - 8;
+	return 0;
+}
+
+static inline int ways_to_cxl(int ways, u8 *iw)
+{
+	if (ways > 16)
+		return -EINVAL;
+	if (is_power_of_2(ways)) {
+		*iw = ilog2(ways);
+		return 0;
+	}
+	if (ways % 3)
+		return -EINVAL;
+	ways /= 3;
+	if (!is_power_of_2(ways))
+		return -EINVAL;
+	*iw = ilog2(ways) + 8;
+	return 0;
+}
+
 /* CXL 2.0 8.2.8.1 Device Capabilities Array Register */
 #define CXLDEV_CAP_ARRAY_OFFSET 0x0
 #define   CXLDEV_CAP_ARRAY_CAP_ID 0
@@ -291,11 +317,14 @@  struct cxl_root_decoder {
 /*
  * enum cxl_config_state - State machine for region configuration
  * @CXL_CONFIG_IDLE: Any sysfs attribute can be written freely
+ * @CXL_CONFIG_INTERLEAVE_ACTIVE: region size has been set, no more
+ * changes to interleave_ways or interleave_granularity
  * @CXL_CONFIG_ACTIVE: All targets have been added the region is now
  * active
  */
 enum cxl_config_state {
 	CXL_CONFIG_IDLE,
+	CXL_CONFIG_INTERLEAVE_ACTIVE,
 	CXL_CONFIG_ACTIVE,
 };
 
@@ -303,12 +332,16 @@  enum cxl_config_state {
  * struct cxl_region_params - region settings
  * @state: allow the driver to lockdown further parameter changes
  * @uuid: unique id for persistent regions
+ * @interleave_ways: number of endpoints in the region
+ * @interleave_granularity: capacity each endpoint contributes to a stripe
  *
  * State transitions are protected by the cxl_region_rwsem
  */
 struct cxl_region_params {
 	enum cxl_config_state state;
 	uuid_t uuid;
+	int interleave_ways;
+	int interleave_granularity;
 };
 
 /**