diff mbox series

cxl: Fix sysfs export of qos_class for memdev

Message ID 170612211746.2737788.8759299841309402825.stgit@djiang5-mobl3
State New, archived
Headers show
Series cxl: Fix sysfs export of qos_class for memdev | expand

Commit Message

Dave Jiang Jan. 24, 2024, 6:48 p.m. UTC
Current implementation exports only to
/sys/bus/cxl/devices/.../memN/qos_class. With both ram and pmem exposed,
the second registered sysfs attribute is rejected as duplicate. It's not
possible to create qos_class under the dev_groups via the driver due to
the ram and pmem sysfs sub-directories already created by the device sysfs
groups. Move the ram and pmem qos_class to the device sysfs groups and add
a call to sysfs_update_group() after the perf data are validated so the
qos_class can be visible. The end result should be
/sys/bus/cxl/devices/.../memN/ram/qos_class and
/sys/bus/cxl/devices/.../memN/pmem/qos_class.

Fixes: 42834b17cf1f ("cxl: Export sysfs attributes for memory device QoS class")
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
---
 drivers/cxl/core/cdat.c   |   12 ++++++
 drivers/cxl/core/core.h   |    3 ++
 drivers/cxl/core/memdev.c |   86 +++++++++++++++++++++++++++++++++++++++++++++
 drivers/cxl/mem.c         |   56 -----------------------------
 4 files changed, 101 insertions(+), 56 deletions(-)

Comments

Dan Williams Jan. 25, 2024, 6:33 a.m. UTC | #1
Dave Jiang wrote:
> Current implementation exports only to
> /sys/bus/cxl/devices/.../memN/qos_class. 

Documentation/ABI/testing/sysfs-bus-cxl says they should be under ram/
and pmem/ but I should have required a cxl_test verification of that basic
detail. This style of ABI sanity checking is cxl_test's main reason for
existing. Especially for scenarios like this that may not exist in
currently available hardware.

> With both ram and pmem exposed,

Luckily this is not a common occurrence in the wild with real devices,
but the number of fixes needed here to make the implementation match the
documentation is threatening to be too big to fix in a -rc. See below:

> the second registered sysfs attribute is rejected as duplicate. It's not
> possible to create qos_class under the dev_groups via the driver due to
> the ram and pmem sysfs sub-directories already created by the device sysfs
> groups. Move the ram and pmem qos_class to the device sysfs groups and add
> a call to sysfs_update() after the perf data are validated so the
> qos_class can be visible. The end results should be
> /sys/bus/cxl/devices/.../memN/ram/qos_class and
> /sys/bus/cxl/devices/.../memN/pmem/qos_class.
> 
> Fixes: 42834b17cf1f ("cxl: Export sysfs attributes for memory device QoS class")
> Signed-off-by: Dave Jiang <dave.jiang@intel.com>

Recall that I was dubious about the list_head usage and the hard to
discern locking rules. This patch pushes that over the edge. The old
locking model was that the lists were populated under the endpoint-port
device_lock() and the reads of them in memdev sysfs were gated by the
fact that they were guaranteed to be settled before the cxl_mem driver
dev_groups were populated.

Now they are moved out of that context, even that undocumented locking
rule is violated. I believe you can hit this race by a loop that does
cxl_mem bind/unbind while reading the qos_class. Even if that does not
trigger, the locking here is subtle and non-obvious which usually means 
broken.

In the end, only the first verified 'struct cxl_dpa_perf' instance per
partition matters, and *that* is what should be stored in the
cxl_memdev_state post init, not the lists.

Have a look at just caching that and see if we can make this fix small
enough to be -rc material.

More comments below:

>  drivers/cxl/core/cdat.c   |   12 ++++++
>  drivers/cxl/core/core.h   |    3 ++
>  drivers/cxl/core/memdev.c |   86 +++++++++++++++++++++++++++++++++++++++++++++
>  drivers/cxl/mem.c         |   56 -----------------------------
>  4 files changed, 101 insertions(+), 56 deletions(-)
> 
> diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c
> index 6fe11546889f..a47dc4102140 100644
> --- a/drivers/cxl/core/cdat.c
> +++ b/drivers/cxl/core/cdat.c
> @@ -395,6 +395,7 @@ void cxl_endpoint_parse_cdat(struct cxl_port *port)
>  {
>  	struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev);
>  	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> +	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
>  	struct xarray __dsmas_xa;
>  	struct xarray *dsmas_xa __free(dsmas) = &__dsmas_xa;
>  	int rc;
> @@ -417,6 +418,17 @@ void cxl_endpoint_parse_cdat(struct cxl_port *port)
>  
>  	cxl_memdev_set_qos_class(cxlds, dsmas_xa);
>  	cxl_qos_class_verify(cxlmd);
> +	if (!list_empty(&mds->ram_perf_list)) {
> +		rc = sysfs_update_group(&cxlmd->dev.kobj, get_cxl_memdev_ram_group());
> +		if (rc)
> +			dev_dbg(&cxlmd->dev, "Failed to update RAM QoS group\n");
> +	}
> +
> +	if (!list_empty(&mds->pmem_perf_list)) {
> +		rc = sysfs_update_group(&cxlmd->dev.kobj, get_cxl_memdev_pmem_group());
> +		if (rc)
> +			dev_dbg(&cxlmd->dev, "Failed to update PMEM QoS group\n");
> +	}

Setting aside that this does not need getter functions and can just link
the attribute_group symbols directly, just move this logic into
memdev.c.  I.e. create a cxl_memdev_attrs_update() that just
unconditionally updates all attribute groups that should retrigger their
is_visible() routines.

Note that this needs to be retriggered on the way down to make the now
defunct qos_class attributes disappear.

>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_endpoint_parse_cdat, CXL);
>  
> diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> index 3b64fb1b9ed0..44958be2178a 100644
> --- a/drivers/cxl/core/core.h
> +++ b/drivers/cxl/core/core.h
> @@ -90,4 +90,7 @@ enum cxl_poison_trace_type {
>  
>  long cxl_pci_get_latency(struct pci_dev *pdev);
>  
> +const struct attribute_group *get_cxl_memdev_ram_group(void);
> +const struct attribute_group *get_cxl_memdev_pmem_group(void);
> +
>  #endif /* __CXL_CORE_H__ */
> diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
> index dae8802ecdb0..5743c6b2710c 100644
> --- a/drivers/cxl/core/memdev.c
> +++ b/drivers/cxl/core/memdev.c
> @@ -447,13 +447,61 @@ static struct attribute *cxl_memdev_attributes[] = {
>  	NULL,
>  };
>  
> +static ssize_t pmem_qos_class_show(struct device *dev,
> +				   struct device_attribute *attr, char *buf)
> +{
> +	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> +	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
> +	struct cxl_dpa_perf *dpa_perf;
> +
> +	if (!dev->driver)
> +		return -ENOENT;

This dev->driver check was pointless when this was a driver dev_group
because a driver dev_group is only published while dev->driver is set.

Now that it is moved to a static memdev attribute that check is racy
without the device_lock().

Side comment, -ENOENT is a surprising error condition for a file that
was successfully found and opened. It was not a problem before because it
was impossible to trigger.

> +	if (list_empty(&mds->pmem_perf_list))
> +		return -ENOENT;

memdev attributes depend on information being stable before the memdev
is registered. This attribute violates that expectation, since perf
information is only valid while the mem driver is attached.

I am thinking that if cxl_memdev_state only caches the single qos class
per partition like this:

diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h
index 5303d6942b88..9b4a26d3acf8 100644
--- a/drivers/cxl/cxlmem.h
+++ b/drivers/cxl/cxlmem.h
@@ -494,8 +494,8 @@ struct cxl_memdev_state {
        u64 next_volatile_bytes;
        u64 next_persistent_bytes;
 
-       struct list_head ram_perf_list;
-       struct list_head pmem_perf_list;
+       struct cxl_dpa_perf ram_perf;
+       struct cxl_dpa_perf pmem_perf;
 
        struct cxl_event_state event;
        struct cxl_poison_state poison;

...then this attribute can locklessly do:

	qos_class = mds->pmem_perf.qos_class;

	if (qos_class < 0)
		return -ENXIO;

Yes, userspace can race and see false positive good values before
sysfs_update_group() can hide this attribute, but I think that's
unavoidable. I think false negatives on the way up are precluded by the
attribute being invisible during that time.

> +	dpa_perf = list_first_entry(&mds->pmem_perf_list, struct cxl_dpa_perf,
> +				    list);
> +
> +	return sysfs_emit(buf, "%d\n", dpa_perf->qos_class);
> +}
> +
> +static struct device_attribute dev_attr_pmem_qos_class =
> +	__ATTR(qos_class, 0444, pmem_qos_class_show, NULL);
> +
>  static struct attribute *cxl_memdev_pmem_attributes[] = {
>  	&dev_attr_pmem_size.attr,
> +	&dev_attr_pmem_qos_class.attr,
>  	NULL,
>  };
>  
> +static ssize_t ram_qos_class_show(struct device *dev,
> +				  struct device_attribute *attr, char *buf)
> +{
> +	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
> +	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
> +	struct cxl_dpa_perf *dpa_perf;
> +
> +	if (!dev->driver)
> +		return -ENOENT;
> +
> +	if (list_empty(&mds->ram_perf_list))
> +		return -ENOENT;
> +
> +	dpa_perf = list_first_entry(&mds->ram_perf_list, struct cxl_dpa_perf,
> +				    list);
> +
> +	return sysfs_emit(buf, "%d\n", dpa_perf->qos_class);
> +}
> +
> +static struct device_attribute dev_attr_ram_qos_class =
> +	__ATTR(qos_class, 0444, ram_qos_class_show, NULL);
> +
>  static struct attribute *cxl_memdev_ram_attributes[] = {
>  	&dev_attr_ram_size.attr,
> +	&dev_attr_ram_qos_class.attr,
>  	NULL,
>  };
>  
> @@ -477,16 +525,54 @@ static struct attribute_group cxl_memdev_attribute_group = {
>  	.is_visible = cxl_memdev_visible,
>  };
>  
> +static umode_t cxl_ram_visible(struct kobject *kobj, struct attribute *a, int n)
> +{
> +	struct device *dev = kobj_to_dev(kobj);
> +	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
> +	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
> +
> +	if (a == &dev_attr_ram_qos_class.attr)
> +		if (list_empty(&mds->ram_perf_list))
> +			return 0;
> +
> +	return a->mode;
> +}
> +
>  static struct attribute_group cxl_memdev_ram_attribute_group = {
>  	.name = "ram",
>  	.attrs = cxl_memdev_ram_attributes,
> +	.is_visible = cxl_ram_visible,
>  };
>  
> +const struct attribute_group *get_cxl_memdev_ram_group(void)
> +{
> +	return &cxl_memdev_ram_attribute_group;

Could have just put &cxl_memdev_ram_attribute_group directly where this
is called, but that's moot now with the cxl_memdev_attrs_update()
proposal.
diff mbox series

Patch

diff --git a/drivers/cxl/core/cdat.c b/drivers/cxl/core/cdat.c
index 6fe11546889f..a47dc4102140 100644
--- a/drivers/cxl/core/cdat.c
+++ b/drivers/cxl/core/cdat.c
@@ -395,6 +395,7 @@  void cxl_endpoint_parse_cdat(struct cxl_port *port)
 {
 	struct cxl_memdev *cxlmd = to_cxl_memdev(port->uport_dev);
 	struct cxl_dev_state *cxlds = cxlmd->cxlds;
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
 	struct xarray __dsmas_xa;
 	struct xarray *dsmas_xa __free(dsmas) = &__dsmas_xa;
 	int rc;
@@ -417,6 +418,17 @@  void cxl_endpoint_parse_cdat(struct cxl_port *port)
 
 	cxl_memdev_set_qos_class(cxlds, dsmas_xa);
 	cxl_qos_class_verify(cxlmd);
+	if (!list_empty(&mds->ram_perf_list)) {
+		rc = sysfs_update_group(&cxlmd->dev.kobj, get_cxl_memdev_ram_group());
+		if (rc)
+			dev_dbg(&cxlmd->dev, "Failed to update RAM QoS group\n");
+	}
+
+	if (!list_empty(&mds->pmem_perf_list)) {
+		rc = sysfs_update_group(&cxlmd->dev.kobj, get_cxl_memdev_pmem_group());
+		if (rc)
+			dev_dbg(&cxlmd->dev, "Failed to update PMEM QoS group\n");
+	}
 }
 EXPORT_SYMBOL_NS_GPL(cxl_endpoint_parse_cdat, CXL);
 
diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 3b64fb1b9ed0..44958be2178a 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -90,4 +90,7 @@  enum cxl_poison_trace_type {
 
 long cxl_pci_get_latency(struct pci_dev *pdev);
 
+const struct attribute_group *get_cxl_memdev_ram_group(void);
+const struct attribute_group *get_cxl_memdev_pmem_group(void);
+
 #endif /* __CXL_CORE_H__ */
diff --git a/drivers/cxl/core/memdev.c b/drivers/cxl/core/memdev.c
index dae8802ecdb0..5743c6b2710c 100644
--- a/drivers/cxl/core/memdev.c
+++ b/drivers/cxl/core/memdev.c
@@ -447,13 +447,61 @@  static struct attribute *cxl_memdev_attributes[] = {
 	NULL,
 };
 
+static ssize_t pmem_qos_class_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
+	struct cxl_dpa_perf *dpa_perf;
+
+	if (!dev->driver)
+		return -ENOENT;
+
+	if (list_empty(&mds->pmem_perf_list))
+		return -ENOENT;
+
+	dpa_perf = list_first_entry(&mds->pmem_perf_list, struct cxl_dpa_perf,
+				    list);
+
+	return sysfs_emit(buf, "%d\n", dpa_perf->qos_class);
+}
+
+static struct device_attribute dev_attr_pmem_qos_class =
+	__ATTR(qos_class, 0444, pmem_qos_class_show, NULL);
+
 static struct attribute *cxl_memdev_pmem_attributes[] = {
 	&dev_attr_pmem_size.attr,
+	&dev_attr_pmem_qos_class.attr,
 	NULL,
 };
 
+static ssize_t ram_qos_class_show(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+	struct cxl_dev_state *cxlds = cxlmd->cxlds;
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
+	struct cxl_dpa_perf *dpa_perf;
+
+	if (!dev->driver)
+		return -ENOENT;
+
+	if (list_empty(&mds->ram_perf_list))
+		return -ENOENT;
+
+	dpa_perf = list_first_entry(&mds->ram_perf_list, struct cxl_dpa_perf,
+				    list);
+
+	return sysfs_emit(buf, "%d\n", dpa_perf->qos_class);
+}
+
+static struct device_attribute dev_attr_ram_qos_class =
+	__ATTR(qos_class, 0444, ram_qos_class_show, NULL);
+
 static struct attribute *cxl_memdev_ram_attributes[] = {
 	&dev_attr_ram_size.attr,
+	&dev_attr_ram_qos_class.attr,
 	NULL,
 };
 
@@ -477,16 +525,54 @@  static struct attribute_group cxl_memdev_attribute_group = {
 	.is_visible = cxl_memdev_visible,
 };
 
+static umode_t cxl_ram_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+
+	if (a == &dev_attr_ram_qos_class.attr)
+		if (list_empty(&mds->ram_perf_list))
+			return 0;
+
+	return a->mode;
+}
+
 static struct attribute_group cxl_memdev_ram_attribute_group = {
 	.name = "ram",
 	.attrs = cxl_memdev_ram_attributes,
+	.is_visible = cxl_ram_visible,
 };
 
+const struct attribute_group *get_cxl_memdev_ram_group(void)
+{
+	return &cxl_memdev_ram_attribute_group;
+}
+
+static umode_t cxl_pmem_visible(struct kobject *kobj, struct attribute *a, int n)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
+	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlmd->cxlds);
+
+	if (a == &dev_attr_pmem_qos_class.attr)
+		if (list_empty(&mds->pmem_perf_list))
+			return 0;
+
+	return a->mode;
+}
+
 static struct attribute_group cxl_memdev_pmem_attribute_group = {
 	.name = "pmem",
 	.attrs = cxl_memdev_pmem_attributes,
+	.is_visible = cxl_pmem_visible,
 };
 
+const struct attribute_group *get_cxl_memdev_pmem_group(void)
+{
+	return &cxl_memdev_pmem_attribute_group;
+}
+
 static umode_t cxl_memdev_security_visible(struct kobject *kobj,
 					   struct attribute *a, int n)
 {
diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
index c5c9d8e0d88d..0c79d9ce877c 100644
--- a/drivers/cxl/mem.c
+++ b/drivers/cxl/mem.c
@@ -215,52 +215,6 @@  static ssize_t trigger_poison_list_store(struct device *dev,
 }
 static DEVICE_ATTR_WO(trigger_poison_list);
 
-static ssize_t ram_qos_class_show(struct device *dev,
-				  struct device_attribute *attr, char *buf)
-{
-	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
-	struct cxl_dev_state *cxlds = cxlmd->cxlds;
-	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
-	struct cxl_dpa_perf *dpa_perf;
-
-	if (!dev->driver)
-		return -ENOENT;
-
-	if (list_empty(&mds->ram_perf_list))
-		return -ENOENT;
-
-	dpa_perf = list_first_entry(&mds->ram_perf_list, struct cxl_dpa_perf,
-				    list);
-
-	return sysfs_emit(buf, "%d\n", dpa_perf->qos_class);
-}
-
-static struct device_attribute dev_attr_ram_qos_class =
-	__ATTR(qos_class, 0444, ram_qos_class_show, NULL);
-
-static ssize_t pmem_qos_class_show(struct device *dev,
-				   struct device_attribute *attr, char *buf)
-{
-	struct cxl_memdev *cxlmd = to_cxl_memdev(dev);
-	struct cxl_dev_state *cxlds = cxlmd->cxlds;
-	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
-	struct cxl_dpa_perf *dpa_perf;
-
-	if (!dev->driver)
-		return -ENOENT;
-
-	if (list_empty(&mds->pmem_perf_list))
-		return -ENOENT;
-
-	dpa_perf = list_first_entry(&mds->pmem_perf_list, struct cxl_dpa_perf,
-				    list);
-
-	return sysfs_emit(buf, "%d\n", dpa_perf->qos_class);
-}
-
-static struct device_attribute dev_attr_pmem_qos_class =
-	__ATTR(qos_class, 0444, pmem_qos_class_show, NULL);
-
 static umode_t cxl_mem_visible(struct kobject *kobj, struct attribute *a, int n)
 {
 	struct device *dev = kobj_to_dev(kobj);
@@ -272,21 +226,11 @@  static umode_t cxl_mem_visible(struct kobject *kobj, struct attribute *a, int n)
 			      mds->poison.enabled_cmds))
 			return 0;
 
-	if (a == &dev_attr_pmem_qos_class.attr)
-		if (list_empty(&mds->pmem_perf_list))
-			return 0;
-
-	if (a == &dev_attr_ram_qos_class.attr)
-		if (list_empty(&mds->ram_perf_list))
-			return 0;
-
 	return a->mode;
 }
 
 static struct attribute *cxl_mem_attrs[] = {
 	&dev_attr_trigger_poison_list.attr,
-	&dev_attr_ram_qos_class.attr,
-	&dev_attr_pmem_qos_class.attr,
 	NULL
 };