diff mbox series

cxl: Update Soft Reserved resources upon region creation

Message ID 20241202155542.22111-1-nathan.fontenot@amd.com (mailing list archive)
State New
Headers show
Series cxl: Update Soft Reserved resources upon region creation | expand

Commit Message

Nathan Fontenot Dec. 2, 2024, 3:55 p.m. UTC
Update handling of SOFT RESERVE iomem resources that intersect with
CXL region resources to remove the intersections from the SOFT RESERVE
resources. The current approach of leaving the SOFT RESERVE
resource as is can cause failures during hotplug replace of CXL
devices because the resource is not available for reuse after
teardown of the CXL device.

The approach is to trim out any pieces of SOFT RESERVE resources
that intersect CXL regions. To do this, first set aside any SOFT RESERVE
resources that intersect with a CFMWS into a separate resource tree
during e820__reserve_resources_late() that would have been otherwise
added to the iomem resource tree.

As CXL regions are created the cxl resource created for the new
region is used to trim intersections from the SOFT RESERVE
resources that were previously set aside.

Once CXL device probe has completed, any remaining SOFT RESERVE resources
are added to the iomem resource tree. As each resource
is added to the iomem resource tree a new notifier chain is invoked
to notify the dax driver of newly added SOFT RESERVE resources so that
the dax driver can consume them.

Signed-off-by: Nathan Fontenot <nathan.fontenot@amd.com>
---
 arch/x86/kernel/e820.c    |  17 ++++-
 drivers/cxl/core/region.c |   8 +-
 drivers/cxl/port.c        |  15 ++++
 drivers/dax/hmem/device.c |  13 ++--
 drivers/dax/hmem/hmem.c   |  15 ++++
 drivers/dax/hmem/hmem.h   |  11 +++
 include/linux/dax.h       |   4 -
 include/linux/ioport.h    |   6 ++
 kernel/resource.c         | 155 +++++++++++++++++++++++++++++++++++++-
 9 files changed, 229 insertions(+), 15 deletions(-)
 create mode 100644 drivers/dax/hmem/hmem.h

Comments

Fan Ni Dec. 2, 2024, 6:56 p.m. UTC | #1
On Mon, Dec 02, 2024 at 09:55:42AM -0600, Nathan Fontenot wrote:
> Update handling of SOFT RESERVE iomem resources that intersect with
> CXL region resources to remove the intersections from the SOFT RESERVE
> resources. The current approach of leaving the SOFT RESERVE
> resource as is can cause failures during hotplug replace of CXL
> devices because the resource is not available for reuse after
> teardown of the CXL device.
> 
> The approach is to trim out any pieces of SOFT RESERVE resources
> that intersect CXL regions. To do this, first set aside any SOFT RESERVE
> resources that intersect with a CFMWS into a separate resource tree
> during e820__reserve_resources_late() that would have been otherwise
> added to the iomem resource tree.
> 
> As CXL regions are created the cxl resource created for the new
> region is used to trim intersections from the SOFT RESERVE
> resources that were previously set aside.
> 
> Once CXL device probe has completed ant remaining SOFT RESERVE resources
/ant/any/
> remaining are added to the iomem resource tree. As each resource
> is added to the oiomem resource tree a new notifier chain is invoked
/oiomem/iomem/
> to notify the dax driver of newly added SOFT RESERVE resources so that
> the dax driver can consume them.

In general, the patch is kind of complicated and hard to review for me.
I am wondering if it can be broken down to make it easier to
review.

One minor thing inline.

> 
> Signed-off-by: Nathan Fontenot <nathan.fontenot@amd.com>
> ---
>  arch/x86/kernel/e820.c    |  17 ++++-
>  drivers/cxl/core/region.c |   8 +-
>  drivers/cxl/port.c        |  15 ++++
>  drivers/dax/hmem/device.c |  13 ++--
>  drivers/dax/hmem/hmem.c   |  15 ++++
>  drivers/dax/hmem/hmem.h   |  11 +++
>  include/linux/dax.h       |   4 -
>  include/linux/ioport.h    |   6 ++
>  kernel/resource.c         | 155 +++++++++++++++++++++++++++++++++++++-
>  9 files changed, 229 insertions(+), 15 deletions(-)
>  create mode 100644 drivers/dax/hmem/hmem.h
> 
> diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
> index 4893d30ce438..cab82e9324a5 100644
> --- a/arch/x86/kernel/e820.c
> +++ b/arch/x86/kernel/e820.c
> @@ -1210,14 +1210,23 @@ static unsigned long __init ram_alignment(resource_size_t pos)
>  
>  void __init e820__reserve_resources_late(void)
>  {
> -	int i;
>  	struct resource *res;
> +	int i;
>  
> +	/*
> +	 * Prior to inserting SOFT_RESERVED resources we want to check for an
> +	 * intersection with potential CXL resources. Any SOFT_RESERVED resources
> +	 * that do intersect a potential CXL resource are set aside so they
> +	 * can be trimmed to accommodate CXL resource intersections and added to
> +	 * the iomem resource tree after the CXL drivers have completed their
> +	 * device probe.
> +	 */
>  	res = e820_res;
> -	for (i = 0; i < e820_table->nr_entries; i++) {
> -		if (!res->parent && res->end)
> +	for (i = 0; i < e820_table->nr_entries; i++, res++) {
> +		if (res->desc == IORES_DESC_SOFT_RESERVED)
> +			insert_soft_reserve_resource(res);
> +		else if (!res->parent && res->end)
>  			insert_resource_expand_to_fit(&iomem_resource, res);
> -		res++;
Maybe we can keep the original (res++) here to avoid the noise since it is a
style thing and does not affect what we want to achieve. 

Fan
>  	}
>  
>  	/*
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 21ad5f242875..c458a6313b31 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -3226,6 +3226,12 @@ static int match_region_by_range(struct device *dev, void *data)
>  	return rc;
>  }
>  
> +static int insert_region_resource(struct resource *parent, struct resource *res)
> +{
> +	trim_soft_reserve_resources(res);
> +	return insert_resource(parent, res);
> +}
> +
>  /* Establish an empty region covering the given HPA range */
>  static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
>  					   struct cxl_endpoint_decoder *cxled)
> @@ -3272,7 +3278,7 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
>  
>  	*res = DEFINE_RES_MEM_NAMED(hpa->start, range_len(hpa),
>  				    dev_name(&cxlr->dev));
> -	rc = insert_resource(cxlrd->res, res);
> +	rc = insert_region_resource(cxlrd->res, res);
>  	if (rc) {
>  		/*
>  		 * Platform-firmware may not have split resources like "System
> diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c
> index d7d5d982ce69..4461f2a80d72 100644
> --- a/drivers/cxl/port.c
> +++ b/drivers/cxl/port.c
> @@ -89,6 +89,20 @@ static int cxl_switch_port_probe(struct cxl_port *port)
>  	return -ENXIO;
>  }
>  
> +static void cxl_sr_update(struct work_struct *w)
> +{
> +	merge_soft_reserve_resources();
> +}
> +
> +DECLARE_DELAYED_WORK(cxl_sr_work, cxl_sr_update);
> +
> +static void schedule_soft_reserve_update(void)
> +{
> +	int timeout = 5 * HZ;
> +
> +	mod_delayed_work(system_wq, &cxl_sr_work, timeout);
> +}
> +
>  static int cxl_endpoint_port_probe(struct cxl_port *port)
>  {
>  	struct cxl_endpoint_dvsec_info info = { .port = port };
> @@ -140,6 +154,7 @@ static int cxl_endpoint_port_probe(struct cxl_port *port)
>  	 */
>  	device_for_each_child(&port->dev, root, discover_region);
>  
> +	schedule_soft_reserve_update();
>  	return 0;
>  }
>  
> diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c
> index f9e1a76a04a9..c45791ad4858 100644
> --- a/drivers/dax/hmem/device.c
> +++ b/drivers/dax/hmem/device.c
> @@ -4,6 +4,7 @@
>  #include <linux/module.h>
>  #include <linux/dax.h>
>  #include <linux/mm.h>
> +#include "hmem.h"
>  
>  static bool nohmem;
>  module_param_named(disable, nohmem, bool, 0444);
> @@ -17,6 +18,9 @@ static struct resource hmem_active = {
>  	.flags = IORESOURCE_MEM,
>  };
>  
> +struct platform_device *hmem_pdev;
> +EXPORT_SYMBOL_GPL(hmem_pdev);
> +
>  int walk_hmem_resources(struct device *host, walk_hmem_fn fn)
>  {
>  	struct resource *res;
> @@ -35,7 +39,6 @@ EXPORT_SYMBOL_GPL(walk_hmem_resources);
>  
>  static void __hmem_register_resource(int target_nid, struct resource *res)
>  {
> -	struct platform_device *pdev;
>  	struct resource *new;
>  	int rc;
>  
> @@ -51,15 +54,15 @@ static void __hmem_register_resource(int target_nid, struct resource *res)
>  	if (platform_initialized)
>  		return;
>  
> -	pdev = platform_device_alloc("hmem_platform", 0);
> -	if (!pdev) {
> +	hmem_pdev = platform_device_alloc("hmem_platform", 0);
> +	if (!hmem_pdev) {
>  		pr_err_once("failed to register device-dax hmem_platform device\n");
>  		return;
>  	}
>  
> -	rc = platform_device_add(pdev);
> +	rc = platform_device_add(hmem_pdev);
>  	if (rc)
> -		platform_device_put(pdev);
> +		platform_device_put(hmem_pdev);
>  	else
>  		platform_initialized = true;
>  }
> diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
> index 5e7c53f18491..d626b60a9716 100644
> --- a/drivers/dax/hmem/hmem.c
> +++ b/drivers/dax/hmem/hmem.c
> @@ -5,6 +5,7 @@
>  #include <linux/pfn_t.h>
>  #include <linux/dax.h>
>  #include "../bus.h"
> +#include "hmem.h"
>  
>  static bool region_idle;
>  module_param_named(region_idle, region_idle, bool, 0644);
> @@ -123,8 +124,22 @@ static int hmem_register_device(struct device *host, int target_nid,
>  	return rc;
>  }
>  
> +static int dax_hmem_cb(struct notifier_block *nb, unsigned long action,
> +		       void *arg)
> +{
> +	struct resource *res = arg;
> +
> +	return hmem_register_device(&hmem_pdev->dev,
> +				    phys_to_target_node(res->start), res);
> +}
> +
> +static struct notifier_block hmem_nb = {
> +	.notifier_call = dax_hmem_cb
> +};
> +
>  static int dax_hmem_platform_probe(struct platform_device *pdev)
>  {
> +	register_soft_reserve_notifier(&hmem_nb);
>  	return walk_hmem_resources(&pdev->dev, hmem_register_device);
>  }
>  
> diff --git a/drivers/dax/hmem/hmem.h b/drivers/dax/hmem/hmem.h
> new file mode 100644
> index 000000000000..95583b59cef7
> --- /dev/null
> +++ b/drivers/dax/hmem/hmem.h
> @@ -0,0 +1,11 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#ifndef _HMEM_H
> +#define _HMEM_H
> +
> +typedef int (*walk_hmem_fn)(struct device *dev, int target_nid,
> +			    const struct resource *res);
> +int walk_hmem_resources(struct device *dev, walk_hmem_fn fn);
> +
> +extern struct platform_device *hmem_pdev;
> +
> +#endif
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index 9d3e3327af4c..119b4e27a592 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -282,8 +282,4 @@ static inline void hmem_register_resource(int target_nid, struct resource *r)
>  {
>  }
>  #endif
> -
> -typedef int (*walk_hmem_fn)(struct device *dev, int target_nid,
> -			    const struct resource *res);
> -int walk_hmem_resources(struct device *dev, walk_hmem_fn fn);
>  #endif
> diff --git a/include/linux/ioport.h b/include/linux/ioport.h
> index 6e9fb667a1c5..487371a46392 100644
> --- a/include/linux/ioport.h
> +++ b/include/linux/ioport.h
> @@ -14,6 +14,7 @@
>  #include <linux/compiler.h>
>  #include <linux/minmax.h>
>  #include <linux/types.h>
> +#include <linux/notifier.h>
>  /*
>   * Resources are tree-like, allowing
>   * nesting etc..
> @@ -249,6 +250,11 @@ struct resource *lookup_resource(struct resource *root, resource_size_t start);
>  int adjust_resource(struct resource *res, resource_size_t start,
>  		    resource_size_t size);
>  resource_size_t resource_alignment(struct resource *res);
> +extern void trim_soft_reserve_resources(const struct resource *res);
> +extern void merge_soft_reserve_resources(void);
> +extern int insert_soft_reserve_resource(struct resource *res);
> +extern int register_soft_reserve_notifier(struct notifier_block *nb);
> +extern int unregister_soft_reserve_notifier(struct notifier_block *nb);
>  static inline resource_size_t resource_size(const struct resource *res)
>  {
>  	return res->end - res->start + 1;
> diff --git a/kernel/resource.c b/kernel/resource.c
> index a83040fde236..8fc4121a1887 100644
> --- a/kernel/resource.c
> +++ b/kernel/resource.c
> @@ -30,7 +30,7 @@
>  #include <linux/string.h>
>  #include <linux/vmalloc.h>
>  #include <asm/io.h>
> -
> +#include <linux/acpi.h>
>  
>  struct resource ioport_resource = {
>  	.name	= "PCI IO",
> @@ -48,7 +48,15 @@ struct resource iomem_resource = {
>  };
>  EXPORT_SYMBOL(iomem_resource);
>  
> +struct resource srmem_resource = {
> +	.name	= "Soft Reserved mem",
> +	.start	= 0,
> +	.end	= -1,
> +	.flags	= IORESOURCE_MEM,
> +};
> +
>  static DEFINE_RWLOCK(resource_lock);
> +static DEFINE_RWLOCK(srmem_resource_lock);
>  
>  static struct resource *next_resource(struct resource *p, bool skip_children)
>  {
> @@ -1034,6 +1042,151 @@ int adjust_resource(struct resource *res, resource_size_t start,
>  }
>  EXPORT_SYMBOL(adjust_resource);
>  
> +static BLOCKING_NOTIFIER_HEAD(soft_reserve_chain);
> +
> +int register_soft_reserve_notifier(struct notifier_block *nb)
> +{
> +	return blocking_notifier_chain_register(&soft_reserve_chain, nb);
> +}
> +EXPORT_SYMBOL(register_soft_reserve_notifier);
> +
> +int unregister_soft_reserve_notifier(struct notifier_block *nb)
> +{
> +	return blocking_notifier_chain_unregister(&soft_reserve_chain, nb);
> +}
> +EXPORT_SYMBOL(unregister_soft_reserve_notifier);
> +
> +static int soft_reserve_notify(unsigned long val, void *v)
> +{
> +	struct resource *res = v;
> +
> +	pr_info("Adding Soft Reserve resource %pr\n", res);
> +	return blocking_notifier_call_chain(&soft_reserve_chain, val, v);
> +}
> +
> +static void trim_soft_reserve(struct resource *sr_res,
> +			      const struct resource *res)
> +{
> +	struct resource *new_res;
> +
> +	if (sr_res->start == res->start && sr_res->end == res->end) {
> +		release_resource(sr_res);
> +		free_resource(sr_res);
> +	} else if (sr_res->start == res->start) {
> +		WARN_ON(adjust_resource(sr_res, res->end + 1,
> +					sr_res->end - res->end));
> +	} else if (sr_res->end == res->end) {
> +		WARN_ON(adjust_resource(sr_res, sr_res->start,
> +					res->start - sr_res->start));
> +	} else {
> +		/*
> +		 * Adjust existing resource to cover the resource
> +		 * range prior to the range to be trimmed.
> +		 */
> +		adjust_resource(sr_res, sr_res->start,
> +				res->start - sr_res->start);
> +
> +		/*
> +		 * Add new resource to cover the resource range for
> +		 * the range after the range to be trimmed.
> +		 */
> +		new_res = alloc_resource(GFP_KERNEL);
> +		if (!new_res)
> +			return;
> +
> +		*new_res = DEFINE_RES_NAMED(res->end + 1, sr_res->end - res->end,
> +					    "Soft Reserved", sr_res->flags);
> +		new_res->desc = IORES_DESC_SOFT_RESERVED;
> +		insert_resource(&srmem_resource, new_res);
> +	}
> +}
> +
> +void trim_soft_reserve_resources(const struct resource *res)
> +{
> +	struct resource *sr_res;
> +
> +	write_lock(&srmem_resource_lock);
> +	for (sr_res = srmem_resource.child; sr_res; sr_res = sr_res->sibling) {
> +		if (resource_contains(sr_res, res)) {
> +			trim_soft_reserve(sr_res, res);
> +			break;
> +		}
> +	}
> +	write_unlock(&srmem_resource_lock);
> +}
> +EXPORT_SYMBOL(trim_soft_reserve_resources);
> +
> +void merge_soft_reserve_resources(void)
> +{
> +	struct resource *sr_res, *next;
> +
> +	write_lock(&srmem_resource_lock);
> +	for (sr_res = srmem_resource.child; sr_res; sr_res = next) {
> +		next = sr_res->sibling;
> +
> +		release_resource(sr_res);
> +		if (insert_resource(&iomem_resource, sr_res))
> +			pr_info("Could not add Soft Reserve %pr\n", sr_res);
> +		else
> +			soft_reserve_notify(0, sr_res);
> +	}
> +	write_unlock(&srmem_resource_lock);
> +}
> +EXPORT_SYMBOL(merge_soft_reserve_resources);
> +
> +struct srmem_arg {
> +	struct resource *res;
> +	int overlaps;
> +};
> +
> +static int srmem_parse_cfmws(union acpi_subtable_headers *hdr,
> +			     void *arg, const unsigned long unused)
> +{
> +	struct acpi_cedt_cfmws *cfmws;
> +	struct srmem_arg *args = arg;
> +	struct resource cfmws_res;
> +	struct resource *res;
> +
> +	res = args->res;
> +
> +	cfmws = (struct acpi_cedt_cfmws *)hdr;
> +	cfmws_res = DEFINE_RES_MEM(cfmws->base_hpa,
> +				   cfmws->base_hpa + cfmws->window_size);
> +
> +	if (resource_overlaps(&cfmws_res, res)) {
> +		args->overlaps += 1;
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +static bool resource_overlaps_cfmws(struct resource *res)
> +{
> +	struct srmem_arg arg = {
> +		.res = res,
> +		.overlaps = 0
> +	};
> +
> +	acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, srmem_parse_cfmws, &arg);
> +
> +	if (arg.overlaps)
> +		return true;
> +
> +	return false;
> +}
> +
> +int insert_soft_reserve_resource(struct resource *res)
> +{
> +	if (resource_overlaps_cfmws(res)) {
> +		pr_info("Reserving Soft Reserve %pr\n", res);
> +		return insert_resource(&srmem_resource, res);
> +	}
> +
> +	return insert_resource(&iomem_resource, res);
> +}
> +EXPORT_SYMBOL(insert_soft_reserve_resource);
> +
>  static void __init
>  __reserve_region_with_split(struct resource *root, resource_size_t start,
>  			    resource_size_t end, const char *name)
> -- 
> 2.43.0
>
kernel test robot Dec. 2, 2024, 7 p.m. UTC | #2
Hi Nathan,

kernel test robot noticed the following build errors:

[auto build test ERROR on cxl/next]
[also build test ERROR on tip/x86/core]
[cannot apply to linus/master cxl/pending v6.13-rc1 next-20241128]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Nathan-Fontenot/cxl-Update-Soft-Reserved-resources-upon-region-creation/20241202-235757
base:   https://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git next
patch link:    https://lore.kernel.org/r/20241202155542.22111-1-nathan.fontenot%40amd.com
patch subject: [PATCH] cxl: Update Soft Reserved resources upon region creation
config: mips-ath79_defconfig (https://download.01.org/0day-ci/archive/20241203/202412030237.JDiP8yCU-lkp@intel.com/config)
compiler: mips-linux-gcc (GCC) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241203/202412030237.JDiP8yCU-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202412030237.JDiP8yCU-lkp@intel.com/

All error/warnings (new ones prefixed by >>):

>> kernel/resource.c:1183:36: warning: 'union acpi_subtable_headers' declared inside parameter list will not be visible outside of this definition or declaration
    1183 | static int srmem_parse_cfmws(union acpi_subtable_headers *hdr,
         |                                    ^~~~~~~~~~~~~~~~~~~~~
   kernel/resource.c: In function 'resource_overlaps_cfmws':
>> kernel/resource.c:1212:9: error: implicit declaration of function 'acpi_table_parse_cedt'; did you mean 'acpi_table_parse'? [-Wimplicit-function-declaration]
    1212 |         acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, srmem_parse_cfmws, &arg);
         |         ^~~~~~~~~~~~~~~~~~~~~
         |         acpi_table_parse


vim +1212 kernel/resource.c

  1182	
> 1183	static int srmem_parse_cfmws(union acpi_subtable_headers *hdr,
  1184				     void *arg, const unsigned long unused)
  1185	{
  1186		struct acpi_cedt_cfmws *cfmws;
  1187		struct srmem_arg *args = arg;
  1188		struct resource cfmws_res;
  1189		struct resource *res;
  1190	
  1191		res = args->res;
  1192	
  1193		cfmws = (struct acpi_cedt_cfmws *)hdr;
  1194		cfmws_res = DEFINE_RES_MEM(cfmws->base_hpa,
  1195					   cfmws->base_hpa + cfmws->window_size);
  1196	
  1197		if (resource_overlaps(&cfmws_res, res)) {
  1198			args->overlaps += 1;
  1199			return 1;
  1200		}
  1201	
  1202		return 0;
  1203	}
  1204	
  1205	static bool resource_overlaps_cfmws(struct resource *res)
  1206	{
  1207		struct srmem_arg arg = {
  1208			.res = res,
  1209			.overlaps = 0
  1210		};
  1211	
> 1212		acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, srmem_parse_cfmws, &arg);
  1213	
  1214		if (arg.overlaps)
  1215			return true;
  1216	
  1217		return false;
  1218	}
  1219
kernel test robot Dec. 3, 2024, 12:31 a.m. UTC | #3
Hi Nathan,

kernel test robot noticed the following build errors:

[auto build test ERROR on cxl/next]
[also build test ERROR on tip/x86/core]
[cannot apply to linus/master cxl/pending v6.13-rc1 next-20241128]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Nathan-Fontenot/cxl-Update-Soft-Reserved-resources-upon-region-creation/20241202-235757
base:   https://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git next
patch link:    https://lore.kernel.org/r/20241202155542.22111-1-nathan.fontenot%40amd.com
patch subject: [PATCH] cxl: Update Soft Reserved resources upon region creation
config: arm-randconfig-004-20241203 (https://download.01.org/0day-ci/archive/20241203/202412030858.l8DbOnRQ-lkp@intel.com/config)
compiler: clang version 20.0.0git (https://github.com/llvm/llvm-project 592c0fe55f6d9a811028b5f3507be91458ab2713)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241203/202412030858.l8DbOnRQ-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202412030858.l8DbOnRQ-lkp@intel.com/

All error/warnings (new ones prefixed by >>):

   In file included from kernel/resource.c:21:
   In file included from include/linux/pseudo_fs.h:4:
   In file included from include/linux/fs_context.h:14:
   In file included from include/linux/security.h:33:
   In file included from include/linux/mm.h:2213:
   include/linux/vmstat.h:518:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
     518 |         return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
         |                               ~~~~~~~~~~~ ^ ~~~
>> kernel/resource.c:1183:36: warning: declaration of 'union acpi_subtable_headers' will not be visible outside of this function [-Wvisibility]
    1183 | static int srmem_parse_cfmws(union acpi_subtable_headers *hdr,
         |                                    ^
>> kernel/resource.c:1212:2: error: call to undeclared function 'acpi_table_parse_cedt'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
    1212 |         acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, srmem_parse_cfmws, &arg);
         |         ^
   kernel/resource.c:1212:2: note: did you mean 'acpi_table_parse'?
   include/linux/acpi.h:914:19: note: 'acpi_table_parse' declared here
     914 | static inline int acpi_table_parse(char *id,
         |                   ^
   2 warnings and 1 error generated.


vim +/acpi_table_parse_cedt +1212 kernel/resource.c

  1182	
> 1183	static int srmem_parse_cfmws(union acpi_subtable_headers *hdr,
  1184				     void *arg, const unsigned long unused)
  1185	{
  1186		struct acpi_cedt_cfmws *cfmws;
  1187		struct srmem_arg *args = arg;
  1188		struct resource cfmws_res;
  1189		struct resource *res;
  1190	
  1191		res = args->res;
  1192	
  1193		cfmws = (struct acpi_cedt_cfmws *)hdr;
  1194		cfmws_res = DEFINE_RES_MEM(cfmws->base_hpa,
  1195					   cfmws->base_hpa + cfmws->window_size);
  1196	
  1197		if (resource_overlaps(&cfmws_res, res)) {
  1198			args->overlaps += 1;
  1199			return 1;
  1200		}
  1201	
  1202		return 0;
  1203	}
  1204	
  1205	static bool resource_overlaps_cfmws(struct resource *res)
  1206	{
  1207		struct srmem_arg arg = {
  1208			.res = res,
  1209			.overlaps = 0
  1210		};
  1211	
> 1212		acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, srmem_parse_cfmws, &arg);
  1213	
  1214		if (arg.overlaps)
  1215			return true;
  1216	
  1217		return false;
  1218	}
  1219
kernel test robot Dec. 3, 2024, 1:12 a.m. UTC | #4
Hi Nathan,

kernel test robot noticed the following build warnings:

[auto build test WARNING on cxl/next]
[also build test WARNING on tip/x86/core]
[cannot apply to linus/master cxl/pending v6.13-rc1 next-20241128]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Nathan-Fontenot/cxl-Update-Soft-Reserved-resources-upon-region-creation/20241202-235757
base:   https://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git next
patch link:    https://lore.kernel.org/r/20241202155542.22111-1-nathan.fontenot%40amd.com
patch subject: [PATCH] cxl: Update Soft Reserved resources upon region creation
config: arm64-randconfig-001-20241203 (https://download.01.org/0day-ci/archive/20241203/202412030840.56S2Dv8L-lkp@intel.com/config)
compiler: aarch64-linux-gcc (GCC) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241203/202412030840.56S2Dv8L-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202412030840.56S2Dv8L-lkp@intel.com/

All warnings (new ones prefixed by >>, old ones prefixed by <<):

>> WARNING: modpost: vmlinux: section mismatch in reference: insert_soft_reserve_resource+0x90 (section: .text.unlikely) -> acpi_table_parse_cedt (section: .init.text)
Fontenot, Nathan Dec. 4, 2024, 4:33 p.m. UTC | #5
On 12/2/2024 12:56 PM, Fan Ni wrote:
> On Mon, Dec 02, 2024 at 09:55:42AM -0600, Nathan Fontenot wrote:
>> Update handling of SOFT RESERVE iomem resources that intersect with
>> CXL region resources to remove the intersections from the SOFT RESERVE
>> resources. The current approach of leaving the SOFT RESERVE
>> resource as is can cause failures during hotplug replace of CXL
>> devices because the resource is not available for reuse after
>> teardown of the CXL device.
>>
>> The approach is to trim out any pieces of SOFT RESERVE resources
>> that intersect CXL regions. To do this, first set aside any SOFT RESERVE
>> resources that intersect with a CFMWS into a separate resource tree
>> during e820__reserve_resources_late() that would have been otherwise
>> added to the iomem resource tree.
>>
>> As CXL regions are created the cxl resource created for the new
>> region is used to trim intersections from the SOFT RESERVE
>> resources that were previously set aside.
>>
>> Once CXL device probe has completed ant remaining SOFT RESERVE resources
> /ant/any/
>> remaining are added to the iomem resource tree. As each resource
>> is added to the oiomem resource tree a new notifier chain is invoked
> /oiomem/iomem/

I'll fix spelling mistakes for next version.

>> to notify the dax driver of newly added SOFT RESERVE resources so that
>> the dax driver can consume them.
> 
> In general, the patch is kind of complicated and hard to review for me.
> I am wondering if it can be broken down to make it easier to
> review.

I had thought about possibly breaking this into a few patches but wanted to get it
out for review. I'll split it up for the next version.

> 
> One minor thing inline.
> 
>>
>> Signed-off-by: Nathan Fontenot <nathan.fontenot@amd.com>
>> ---
>>  arch/x86/kernel/e820.c    |  17 ++++-
>>  drivers/cxl/core/region.c |   8 +-
>>  drivers/cxl/port.c        |  15 ++++
>>  drivers/dax/hmem/device.c |  13 ++--
>>  drivers/dax/hmem/hmem.c   |  15 ++++
>>  drivers/dax/hmem/hmem.h   |  11 +++
>>  include/linux/dax.h       |   4 -
>>  include/linux/ioport.h    |   6 ++
>>  kernel/resource.c         | 155 +++++++++++++++++++++++++++++++++++++-
>>  9 files changed, 229 insertions(+), 15 deletions(-)
>>  create mode 100644 drivers/dax/hmem/hmem.h
>>
>> diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
>> index 4893d30ce438..cab82e9324a5 100644
>> --- a/arch/x86/kernel/e820.c
>> +++ b/arch/x86/kernel/e820.c
>> @@ -1210,14 +1210,23 @@ static unsigned long __init ram_alignment(resource_size_t pos)
>>  
>>  void __init e820__reserve_resources_late(void)
>>  {
>> -	int i;
>>  	struct resource *res;
>> +	int i;
>>  
>> +	/*
>> +	 * Prior to inserting SOFT_RESERVED resources we want to check for an
>> +	 * intersection with potential CXL resources. Any SOFT_RESERVED resources
>> +	 * that do intersect a potential CXL resource are set aside so they
>> +	 * can be trimmed to accommodate CXL resource intersections and added to
>> +	 * the iomem resource tree after the CXL drivers have completed their
>> +	 * device probe.
>> +	 */
>>  	res = e820_res;
>> -	for (i = 0; i < e820_table->nr_entries; i++) {
>> -		if (!res->parent && res->end)
>> +	for (i = 0; i < e820_table->nr_entries; i++, res++) {
>> +		if (res->desc == IORES_DESC_SOFT_RESERVED)
>> +			insert_soft_reserve_resource(res);
>> +		else if (!res->parent && res->end)
>>  			insert_resource_expand_to_fit(&iomem_resource, res);
>> -		res++;
> Maybe we can keep the original (res++) here to avoid the noise since it is a
> style thing and does not affect what we want to achieve. 

Yes, I can make that change.

-Nathan

> 
> Fan
>>  	}
>>  
>>  	/*
>> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
>> index 21ad5f242875..c458a6313b31 100644
>> --- a/drivers/cxl/core/region.c
>> +++ b/drivers/cxl/core/region.c
>> @@ -3226,6 +3226,12 @@ static int match_region_by_range(struct device *dev, void *data)
>>  	return rc;
>>  }
>>  
>> +static int insert_region_resource(struct resource *parent, struct resource *res)
>> +{
>> +	trim_soft_reserve_resources(res);
>> +	return insert_resource(parent, res);
>> +}
>> +
>>  /* Establish an empty region covering the given HPA range */
>>  static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
>>  					   struct cxl_endpoint_decoder *cxled)
>> @@ -3272,7 +3278,7 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
>>  
>>  	*res = DEFINE_RES_MEM_NAMED(hpa->start, range_len(hpa),
>>  				    dev_name(&cxlr->dev));
>> -	rc = insert_resource(cxlrd->res, res);
>> +	rc = insert_region_resource(cxlrd->res, res);
>>  	if (rc) {
>>  		/*
>>  		 * Platform-firmware may not have split resources like "System
>> diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c
>> index d7d5d982ce69..4461f2a80d72 100644
>> --- a/drivers/cxl/port.c
>> +++ b/drivers/cxl/port.c
>> @@ -89,6 +89,20 @@ static int cxl_switch_port_probe(struct cxl_port *port)
>>  	return -ENXIO;
>>  }
>>  
>> +static void cxl_sr_update(struct work_struct *w)
>> +{
>> +	merge_soft_reserve_resources();
>> +}
>> +
>> +DECLARE_DELAYED_WORK(cxl_sr_work, cxl_sr_update);
>> +
>> +static void schedule_soft_reserve_update(void)
>> +{
>> +	int timeout = 5 * HZ;
>> +
>> +	mod_delayed_work(system_wq, &cxl_sr_work, timeout);
>> +}
>> +
>>  static int cxl_endpoint_port_probe(struct cxl_port *port)
>>  {
>>  	struct cxl_endpoint_dvsec_info info = { .port = port };
>> @@ -140,6 +154,7 @@ static int cxl_endpoint_port_probe(struct cxl_port *port)
>>  	 */
>>  	device_for_each_child(&port->dev, root, discover_region);
>>  
>> +	schedule_soft_reserve_update();
>>  	return 0;
>>  }
>>  
>> diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c
>> index f9e1a76a04a9..c45791ad4858 100644
>> --- a/drivers/dax/hmem/device.c
>> +++ b/drivers/dax/hmem/device.c
>> @@ -4,6 +4,7 @@
>>  #include <linux/module.h>
>>  #include <linux/dax.h>
>>  #include <linux/mm.h>
>> +#include "hmem.h"
>>  
>>  static bool nohmem;
>>  module_param_named(disable, nohmem, bool, 0444);
>> @@ -17,6 +18,9 @@ static struct resource hmem_active = {
>>  	.flags = IORESOURCE_MEM,
>>  };
>>  
>> +struct platform_device *hmem_pdev;
>> +EXPORT_SYMBOL_GPL(hmem_pdev);
>> +
>>  int walk_hmem_resources(struct device *host, walk_hmem_fn fn)
>>  {
>>  	struct resource *res;
>> @@ -35,7 +39,6 @@ EXPORT_SYMBOL_GPL(walk_hmem_resources);
>>  
>>  static void __hmem_register_resource(int target_nid, struct resource *res)
>>  {
>> -	struct platform_device *pdev;
>>  	struct resource *new;
>>  	int rc;
>>  
>> @@ -51,15 +54,15 @@ static void __hmem_register_resource(int target_nid, struct resource *res)
>>  	if (platform_initialized)
>>  		return;
>>  
>> -	pdev = platform_device_alloc("hmem_platform", 0);
>> -	if (!pdev) {
>> +	hmem_pdev = platform_device_alloc("hmem_platform", 0);
>> +	if (!hmem_pdev) {
>>  		pr_err_once("failed to register device-dax hmem_platform device\n");
>>  		return;
>>  	}
>>  
>> -	rc = platform_device_add(pdev);
>> +	rc = platform_device_add(hmem_pdev);
>>  	if (rc)
>> -		platform_device_put(pdev);
>> +		platform_device_put(hmem_pdev);
>>  	else
>>  		platform_initialized = true;
>>  }
>> diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
>> index 5e7c53f18491..d626b60a9716 100644
>> --- a/drivers/dax/hmem/hmem.c
>> +++ b/drivers/dax/hmem/hmem.c
>> @@ -5,6 +5,7 @@
>>  #include <linux/pfn_t.h>
>>  #include <linux/dax.h>
>>  #include "../bus.h"
>> +#include "hmem.h"
>>  
>>  static bool region_idle;
>>  module_param_named(region_idle, region_idle, bool, 0644);
>> @@ -123,8 +124,22 @@ static int hmem_register_device(struct device *host, int target_nid,
>>  	return rc;
>>  }
>>  
>> +static int dax_hmem_cb(struct notifier_block *nb, unsigned long action,
>> +		       void *arg)
>> +{
>> +	struct resource *res = arg;
>> +
>> +	return hmem_register_device(&hmem_pdev->dev,
>> +				    phys_to_target_node(res->start), res);
>> +}
>> +
>> +static struct notifier_block hmem_nb = {
>> +	.notifier_call = dax_hmem_cb
>> +};
>> +
>>  static int dax_hmem_platform_probe(struct platform_device *pdev)
>>  {
>> +	register_soft_reserve_notifier(&hmem_nb);
>>  	return walk_hmem_resources(&pdev->dev, hmem_register_device);
>>  }
>>  
>> diff --git a/drivers/dax/hmem/hmem.h b/drivers/dax/hmem/hmem.h
>> new file mode 100644
>> index 000000000000..95583b59cef7
>> --- /dev/null
>> +++ b/drivers/dax/hmem/hmem.h
>> @@ -0,0 +1,11 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +#ifndef _HMEM_H
>> +#define _HMEM_H
>> +
>> +typedef int (*walk_hmem_fn)(struct device *dev, int target_nid,
>> +			    const struct resource *res);
>> +int walk_hmem_resources(struct device *dev, walk_hmem_fn fn);
>> +
>> +extern struct platform_device *hmem_pdev;
>> +
>> +#endif
>> diff --git a/include/linux/dax.h b/include/linux/dax.h
>> index 9d3e3327af4c..119b4e27a592 100644
>> --- a/include/linux/dax.h
>> +++ b/include/linux/dax.h
>> @@ -282,8 +282,4 @@ static inline void hmem_register_resource(int target_nid, struct resource *r)
>>  {
>>  }
>>  #endif
>> -
>> -typedef int (*walk_hmem_fn)(struct device *dev, int target_nid,
>> -			    const struct resource *res);
>> -int walk_hmem_resources(struct device *dev, walk_hmem_fn fn);
>>  #endif
>> diff --git a/include/linux/ioport.h b/include/linux/ioport.h
>> index 6e9fb667a1c5..487371a46392 100644
>> --- a/include/linux/ioport.h
>> +++ b/include/linux/ioport.h
>> @@ -14,6 +14,7 @@
>>  #include <linux/compiler.h>
>>  #include <linux/minmax.h>
>>  #include <linux/types.h>
>> +#include <linux/notifier.h>
>>  /*
>>   * Resources are tree-like, allowing
>>   * nesting etc..
>> @@ -249,6 +250,11 @@ struct resource *lookup_resource(struct resource *root, resource_size_t start);
>>  int adjust_resource(struct resource *res, resource_size_t start,
>>  		    resource_size_t size);
>>  resource_size_t resource_alignment(struct resource *res);
>> +extern void trim_soft_reserve_resources(const struct resource *res);
>> +extern void merge_soft_reserve_resources(void);
>> +extern int insert_soft_reserve_resource(struct resource *res);
>> +extern int register_soft_reserve_notifier(struct notifier_block *nb);
>> +extern int unregister_soft_reserve_notifier(struct notifier_block *nb);
>>  static inline resource_size_t resource_size(const struct resource *res)
>>  {
>>  	return res->end - res->start + 1;
>> diff --git a/kernel/resource.c b/kernel/resource.c
>> index a83040fde236..8fc4121a1887 100644
>> --- a/kernel/resource.c
>> +++ b/kernel/resource.c
>> @@ -30,7 +30,7 @@
>>  #include <linux/string.h>
>>  #include <linux/vmalloc.h>
>>  #include <asm/io.h>
>> -
>> +#include <linux/acpi.h>
>>  
>>  struct resource ioport_resource = {
>>  	.name	= "PCI IO",
>> @@ -48,7 +48,15 @@ struct resource iomem_resource = {
>>  };
>>  EXPORT_SYMBOL(iomem_resource);
>>  
>> +struct resource srmem_resource = {
>> +	.name	= "Soft Reserved mem",
>> +	.start	= 0,
>> +	.end	= -1,
>> +	.flags	= IORESOURCE_MEM,
>> +};
>> +
>>  static DEFINE_RWLOCK(resource_lock);
>> +static DEFINE_RWLOCK(srmem_resource_lock);
>>  
>>  static struct resource *next_resource(struct resource *p, bool skip_children)
>>  {
>> @@ -1034,6 +1042,151 @@ int adjust_resource(struct resource *res, resource_size_t start,
>>  }
>>  EXPORT_SYMBOL(adjust_resource);
>>  
>> +static BLOCKING_NOTIFIER_HEAD(soft_reserve_chain);
>> +
>> +int register_soft_reserve_notifier(struct notifier_block *nb)
>> +{
>> +	return blocking_notifier_chain_register(&soft_reserve_chain, nb);
>> +}
>> +EXPORT_SYMBOL(register_soft_reserve_notifier);
>> +
>> +int unregister_soft_reserve_notifier(struct notifier_block *nb)
>> +{
>> +	return blocking_notifier_chain_unregister(&soft_reserve_chain, nb);
>> +}
>> +EXPORT_SYMBOL(unregister_soft_reserve_notifier);
>> +
>> +static int soft_reserve_notify(unsigned long val, void *v)
>> +{
>> +	struct resource *res = v;
>> +
>> +	pr_info("Adding Soft Reserve resource %pr\n", res);
>> +	return blocking_notifier_call_chain(&soft_reserve_chain, val, v);
>> +}
>> +
>> +static void trim_soft_reserve(struct resource *sr_res,
>> +			      const struct resource *res)
>> +{
>> +	struct resource *new_res;
>> +
>> +	if (sr_res->start == res->start && sr_res->end == res->end) {
>> +		release_resource(sr_res);
>> +		free_resource(sr_res);
>> +	} else if (sr_res->start == res->start) {
>> +		WARN_ON(adjust_resource(sr_res, res->end + 1,
>> +					sr_res->end - res->end));
>> +	} else if (sr_res->end == res->end) {
>> +		WARN_ON(adjust_resource(sr_res, sr_res->start,
>> +					res->start - sr_res->start));
>> +	} else {
>> +		/*
>> +		 * Adjust existing resource to cover the resource
>> +		 * range prior to the range to be trimmed.
>> +		 */
>> +		adjust_resource(sr_res, sr_res->start,
>> +				res->start - sr_res->start);
>> +
>> +		/*
>> +		 * Add new resource to cover the resource range for
>> +		 * the range after the range to be trimmed.
>> +		 */
>> +		new_res = alloc_resource(GFP_KERNEL);
>> +		if (!new_res)
>> +			return;
>> +
>> +		*new_res = DEFINE_RES_NAMED(res->end + 1, sr_res->end - res->end,
>> +					    "Soft Reserved", sr_res->flags);
>> +		new_res->desc = IORES_DESC_SOFT_RESERVED;
>> +		insert_resource(&srmem_resource, new_res);
>> +	}
>> +}
>> +
>> +void trim_soft_reserve_resources(const struct resource *res)
>> +{
>> +	struct resource *sr_res;
>> +
>> +	write_lock(&srmem_resource_lock);
>> +	for (sr_res = srmem_resource.child; sr_res; sr_res = sr_res->sibling) {
>> +		if (resource_contains(sr_res, res)) {
>> +			trim_soft_reserve(sr_res, res);
>> +			break;
>> +		}
>> +	}
>> +	write_unlock(&srmem_resource_lock);
>> +}
>> +EXPORT_SYMBOL(trim_soft_reserve_resources);
>> +
>> +void merge_soft_reserve_resources(void)
>> +{
>> +	struct resource *sr_res, *next;
>> +
>> +	write_lock(&srmem_resource_lock);
>> +	for (sr_res = srmem_resource.child; sr_res; sr_res = next) {
>> +		next = sr_res->sibling;
>> +
>> +		release_resource(sr_res);
>> +		if (insert_resource(&iomem_resource, sr_res))
>> +			pr_info("Could not add Soft Reserve %pr\n", sr_res);
>> +		else
>> +			soft_reserve_notify(0, sr_res);
>> +	}
>> +	write_unlock(&srmem_resource_lock);
>> +}
>> +EXPORT_SYMBOL(merge_soft_reserve_resources);
>> +
>> +struct srmem_arg {
>> +	struct resource *res;
>> +	int overlaps;
>> +};
>> +
>> +static int srmem_parse_cfmws(union acpi_subtable_headers *hdr,
>> +			     void *arg, const unsigned long unused)
>> +{
>> +	struct acpi_cedt_cfmws *cfmws;
>> +	struct srmem_arg *args = arg;
>> +	struct resource cfmws_res;
>> +	struct resource *res;
>> +
>> +	res = args->res;
>> +
>> +	cfmws = (struct acpi_cedt_cfmws *)hdr;
>> +	cfmws_res = DEFINE_RES_MEM(cfmws->base_hpa,
>> +				   cfmws->base_hpa + cfmws->window_size);
>> +
>> +	if (resource_overlaps(&cfmws_res, res)) {
>> +		args->overlaps += 1;
>> +		return 1;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static bool resource_overlaps_cfmws(struct resource *res)
>> +{
>> +	struct srmem_arg arg = {
>> +		.res = res,
>> +		.overlaps = 0
>> +	};
>> +
>> +	acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, srmem_parse_cfmws, &arg);
>> +
>> +	if (arg.overlaps)
>> +		return true;
>> +
>> +	return false;
>> +}
>> +
>> +int insert_soft_reserve_resource(struct resource *res)
>> +{
>> +	if (resource_overlaps_cfmws(res)) {
>> +		pr_info("Reserving Soft Reserve %pr\n", res);
>> +		return insert_resource(&srmem_resource, res);
>> +	}
>> +
>> +	return insert_resource(&iomem_resource, res);
>> +}
>> +EXPORT_SYMBOL(insert_soft_reserve_resource);
>> +
>>  static void __init
>>  __reserve_region_with_split(struct resource *root, resource_size_t start,
>>  			    resource_size_t end, const char *name)
>> -- 
>> 2.43.0
>>
>
Andy Shevchenko Dec. 11, 2024, 11:31 a.m. UTC | #6
On Mon, Dec 02, 2024 at 09:55:42AM -0600, Nathan Fontenot wrote:
> Update handling of SOFT RESERVE iomem resources that intersect with
> CXL region resources to remove the intersections from the SOFT RESERVE
> resources. The current approach of leaving the SOFT RESERVE
> resource as is can cause failures during hotplug replace of CXL
> devices because the resource is not available for reuse after
> teardown of the CXL device.
> 
> The approach is to trim out any pieces of SOFT RESERVE resources
> that intersect CXL regions. To do this, first set aside any SOFT RESERVE
> resources that intersect with a CFMWS into a separate resource tree
> during e820__reserve_resources_late() that would have been otherwise
> added to the iomem resource tree.
> 
> As CXL regions are created the cxl resource created for the new
> region is used to trim intersections from the SOFT RESERVE
> resources that were previously set aside.
> 
> Once CXL device probe has completed, any remaining SOFT RESERVE resources
> are added to the iomem resource tree. As each resource
> is added to the iomem resource tree a new notifier chain is invoked
> to notify the dax driver of newly added SOFT RESERVE resources so that
> the dax driver can consume them.

...

>  void __init e820__reserve_resources_late(void)
>  {
> -	int i;
>  	struct resource *res;
> +	int i;

Unrelated change.

...

> -	for (i = 0; i < e820_table->nr_entries; i++) {

> +	for (i = 0; i < e820_table->nr_entries; i++, res++) {


> -		res++;

Unrelated change.


>  	}

...

> +static struct notifier_block hmem_nb = {
> +	.notifier_call = dax_hmem_cb

It's better to leave trailing comma as it reduces churn in the future if
anything is added here.

> +};

...

> +++ b/drivers/dax/hmem/hmem.h
> @@ -0,0 +1,11 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#ifndef _HMEM_H
> +#define _HMEM_H

This needs a few forward declarations

struct device;
struct platform_device;
struct resource;

> +typedef int (*walk_hmem_fn)(struct device *dev, int target_nid,
> +			    const struct resource *res);
> +int walk_hmem_resources(struct device *dev, walk_hmem_fn fn);
> +
> +extern struct platform_device *hmem_pdev;
> +
> +#endif

...

>  #include <linux/compiler.h>
>  #include <linux/minmax.h>
>  #include <linux/types.h>
> +#include <linux/notifier.h>

I would put it before types.h to have more ordered piece.

...

> +extern void trim_soft_reserve_resources(const struct resource *res);
> +extern void merge_soft_reserve_resources(void);
> +extern int insert_soft_reserve_resource(struct resource *res);
> +extern int register_soft_reserve_notifier(struct notifier_block *nb);
> +extern int unregister_soft_reserve_notifier(struct notifier_block *nb);

Why extern?

...

> +++ b/kernel/resource.c
> @@ -30,7 +30,7 @@
>  #include <linux/string.h>
>  #include <linux/vmalloc.h>
>  #include <asm/io.h>
> -
> +#include <linux/acpi.h>

We don't usually interleave linux and asm headers, moreover the list seems to
be sorted (ordered, please preserve the ordering).

...

> +struct resource srmem_resource = {
> +	.name	= "Soft Reserved mem",
> +	.start	= 0,
> +	.end	= -1,
> +	.flags	= IORESOURCE_MEM,

This can use DEFINE_RES_MEM_NAMED() as well.

> +};

...

> +	if (sr_res->start == res->start && sr_res->end == res->end) {

Wondering if we have a helper to exact match the resource by range...

> +		release_resource(sr_res);
> +		free_resource(sr_res);
> +	} else if (sr_res->start == res->start) {
> +		WARN_ON(adjust_resource(sr_res, res->end + 1,
> +					sr_res->end - res->end));
> +	} else if (sr_res->end == res->end) {
> +		WARN_ON(adjust_resource(sr_res, sr_res->start,
> +					res->start - sr_res->start));
> +	} else {
> +		/*
> +		 * Adjust existing resource to cover the resource
> +		 * range prior to the range to be trimmed.
> +		 */
> +		adjust_resource(sr_res, sr_res->start,
> +				res->start - sr_res->start);
> +
> +		/*
> +		 * Add new resource to cover the resource range for
> +		 * the range after the range to be trimmed.
> +		 */
> +		new_res = alloc_resource(GFP_KERNEL);
> +		if (!new_res)
> +			return;
> +
> +		*new_res = DEFINE_RES_NAMED(res->end + 1, sr_res->end - res->end,
> +					    "Soft Reserved", sr_res->flags);
> +		new_res->desc = IORES_DESC_SOFT_RESERVED;
> +		insert_resource(&srmem_resource, new_res);
> +	}

...

> +void trim_soft_reserve_resources(const struct resource *res)
> +{
> +	struct resource *sr_res;
> +
> +	write_lock(&srmem_resource_lock);
> +	for (sr_res = srmem_resource.child; sr_res; sr_res = sr_res->sibling) {

Can this utilise for_each_resource*()?
Ditto for the rest of open coded for each type of loops.

> +		if (resource_contains(sr_res, res)) {
> +			trim_soft_reserve(sr_res, res);
> +			break;
> +		}
> +	}
> +	write_unlock(&srmem_resource_lock);
> +}

...

> +	cfmws_res = DEFINE_RES_MEM(cfmws->base_hpa,
> +				   cfmws->base_hpa + cfmws->window_size);

Can be one line.
But is this correct? The parameters are start,size, and here it seems like start,end.

...

> +static bool resource_overlaps_cfmws(struct resource *res)
> +{
> +	struct srmem_arg arg = {
> +		.res = res,

> +		.overlaps = 0
Keep trailing comma.

> +	};
> +
> +	acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, srmem_parse_cfmws, &arg);

> +	if (arg.overlaps)
> +		return true;
> +
> +	return false;

	return arg.overlaps;

> +}

...

> +int insert_soft_reserve_resource(struct resource *res)
> +{
> +	if (resource_overlaps_cfmws(res)) {
> +		pr_info("Reserving Soft Reserve %pr\n", res);

Btw, do we have pr_fmt() defined in this file?

> +		return insert_resource(&srmem_resource, res);
> +	}
> +
> +	return insert_resource(&iomem_resource, res);
> +}
Fontenot, Nathan Dec. 11, 2024, 8:11 p.m. UTC | #7
responses below...

On 12/11/2024 5:31 AM, Andy Shevchenko wrote:
> On Mon, Dec 02, 2024 at 09:55:42AM -0600, Nathan Fontenot wrote:
>> Update handling of SOFT RESERVE iomem resources that intersect with
>> CXL region resources to remove the intersections from the SOFT RESERVE
>> resources. The current approach of leaving the SOFT RESERVE
>> resource as is can cause failures during hotplug replace of CXL
>> devices because the resource is not available for reuse after
>> teardown of the CXL device.
>>
>> The approach is to trim out any pieces of SOFT RESERVE resources
>> that intersect CXL regions. To do this, first set aside any SOFT RESERVE
>> resources that intersect with a CFMWS into a separate resource tree
>> during e820__reserve_resources_late() that would have been otherwise
>> added to the iomem resource tree.
>>
>> As CXL regions are created the cxl resource created for the new
>> region is used to trim intersections from the SOFT RESERVE
>> resources that were previously set aside.
>>
>> Once CXL device probe has completed, any remaining SOFT RESERVE resources
>> are added to the iomem resource tree. As each resource
>> is added to the iomem resource tree a new notifier chain is invoked
>> to notify the dax driver of newly added SOFT RESERVE resources so that
>> the dax driver can consume them.
> 
> ...
> 
>>  void __init e820__reserve_resources_late(void)
>>  {
>> -	int i;
>>  	struct resource *res;
>> +	int i;
> 
> Unrelated change.
> 
> ...
> 
>> -	for (i = 0; i < e820_table->nr_entries; i++) {
> 
>> +	for (i = 0; i < e820_table->nr_entries; i++, res++) {
> 
> 
>> -		res++;
> 
> Unrelated change.

I can remove these unrelated changes.

> 
> 
>>  	}
> 
> ...
> 
>> +static struct notifier_block hmem_nb = {
>> +	.notifier_call = dax_hmem_cb
> 
> It's better to leave trailing comma as it reduces churn in the future is
> anything to add here.
> 
>> +};
> 
> ...
> 
>> +++ b/drivers/dax/hmem/hmem.h
>> @@ -0,0 +1,11 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +#ifndef _HMEM_H
>> +#define _HMEM_H
> 
> This needs a few forward declarations
> 
> struct device;
> struct platform_device;
> struct resource;

The next version of the patch removes this newly created .h file but good point
for any new headers created.
 
> 
>> +typedef int (*walk_hmem_fn)(struct device *dev, int target_nid,
>> +			    const struct resource *res);
>> +int walk_hmem_resources(struct device *dev, walk_hmem_fn fn);
>> +
>> +extern struct platform_device *hmem_pdev;
>> +
>> +#endif
> 
> ...
> 
>>  #include <linux/compiler.h>
>>  #include <linux/minmax.h>
>>  #include <linux/types.h>
>> +#include <linux/notifier.h>
> 
> I would put it before types.h to have more ordered piece.

will do.

> 
> ...
> 
>> +extern void trim_soft_reserve_resources(const struct resource *res);
>> +extern void merge_soft_reserve_resources(void);
>> +extern int insert_soft_reserve_resource(struct resource *res);
>> +extern int register_soft_reserve_notifier(struct notifier_block *nb);
>> +extern int unregister_soft_reserve_notifier(struct notifier_block *nb);
> 
> Why extern?

You're correct, extern isn't needed. I used it to follow what is done for other
declarations in the file only.

> 
> ...
> 
>> +++ b/kernel/resource.c
>> @@ -30,7 +30,7 @@
>>  #include <linux/string.h>
>>  #include <linux/vmalloc.h>
>>  #include <asm/io.h>
>> -
>> +#include <linux/acpi.h>
> 
> We don't usually interleave linux and asm headers, moreover the list seems to
> be sorted (ordered, please preserve the ordering).
> 

Good point, I'll correct this.

> ...
> 
>> +struct resource srmem_resource = {
>> +	.name	= "Soft Reserved mem",
>> +	.start	= 0,
>> +	.end	= -1,
>> +	.flags	= IORESOURCE_MEM,
> 
> This can use DEFINE_RES_MEM_NAMED() as well.

Yes. I went with this format since it is what is used for the other two
struct resource definitions currently in the file.

> 
>> +};
> 
> ...
> 
>> +	if (sr_res->start == res->start && sr_res->end == res->end) {
> 
> Wondering if we have a helper to exact match the resource by range...

I don't remember seeing one but will investigate.

> 
>> +		release_resource(sr_res);
>> +		free_resource(sr_res);
>> +	} else if (sr_res->start == res->start) {
>> +		WARN_ON(adjust_resource(sr_res, res->end + 1,
>> +					sr_res->end - res->end));
>> +	} else if (sr_res->end == res->end) {
>> +		WARN_ON(adjust_resource(sr_res, sr_res->start,
>> +					res->start - sr_res->start));
>> +	} else {
>> +		/*
>> +		 * Adjust existing resource to cover the resource
>> +		 * range prior to the range to be trimmed.
>> +		 */
>> +		adjust_resource(sr_res, sr_res->start,
>> +				res->start - sr_res->start);
>> +
>> +		/*
>> +		 * Add new resource to cover the resource range for
>> +		 * the range after the range to be trimmed.
>> +		 */
>> +		new_res = alloc_resource(GFP_KERNEL);
>> +		if (!new_res)
>> +			return;
>> +
>> +		*new_res = DEFINE_RES_NAMED(res->end + 1, sr_res->end - res->end,
>> +					    "Soft Reserved", sr_res->flags);
>> +		new_res->desc = IORES_DESC_SOFT_RESERVED;
>> +		insert_resource(&srmem_resource, new_res);
>> +	}
> 
> ...
> 
>> +void trim_soft_reserve_resources(const struct resource *res)
>> +{
>> +	struct resource *sr_res;
>> +
>> +	write_lock(&srmem_resource_lock);
>> +	for (sr_res = srmem_resource.child; sr_res; sr_res = sr_res->sibling) {
> 
> Can this utilise for_each_resource*()?
> Ditto for the rest of open coded for each type of loops.

This one could use for_each_resource(). There is one loop that cannot since it
moves struct resources from one list to another.

> 
>> +		if (resource_contains(sr_res, res)) {
>> +			trim_soft_reserve(sr_res, res);
>> +			break;
>> +		}
>> +	}
>> +	write_unlock(&srmem_resource_lock);
>> +}
> 
> ...
> 
>> +	cfmws_res = DEFINE_RES_MEM(cfmws->base_hpa,
>> +				   cfmws->base_hpa + cfmws->window_size);
> 
> Can be one line.
> But is this correct? The parameters are start,size, and here it seems like start,end.

Good catch. I'll fix this.

> 
> ...
> 
>> +static bool resource_overlaps_cfmws(struct resource *res)
>> +{
>> +	struct srmem_arg arg = {
>> +		.res = res,
> 
>> +		.overlaps = 0
> Keep trailing comma.

will do.

> 
>> +	};
>> +
>> +	acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, srmem_parse_cfmws, &arg);
> 
>> +	if (arg.overlaps)
>> +		return true;
>> +
>> +	return false;
> 
> 	return arg.overlaps;
> 
>> +}
> 
> ...
> 
>> +int insert_soft_reserve_resource(struct resource *res)
>> +{
>> +	if (resource_overlaps_cfmws(res)) {
>> +		pr_info("Reserving Soft Reserve %pr\n", res);
> 
> Btw, do we have pr_fmt() defined in this file?

Yes, there is a pr_fmt for kernel/resource.c

Thanks for the review.

-Nathan

> 
>> +		return insert_resource(&srmem_resource, res);
>> +	}
>> +
>> +	return insert_resource(&iomem_resource, res);
>> +}
>
Dan Williams Dec. 11, 2024, 10:30 p.m. UTC | #8
Nathan Fontenot wrote:
> Update handling of SOFT RESERVE iomem resources that intersect with
> CXL region resources to remove the intersections from the SOFT RESERVE
> resources. The current approach of leaving the SOFT RESERVE
> resource as is can cause failures during hotplug replace of CXL
> devices because the resource is not available for reuse after
> teardown of the CXL device.
> 
> The approach is to trim out any pieces of SOFT RESERVE resources
> that intersect CXL regions. To do this, first set aside any SOFT RESERVE
> resources that intersect with a CFMWS into a separate resource tree
> during e820__reserve_resources_late() that would have been otherwise
> added to the iomem resource tree.
> 
> As CXL regions are created the cxl resource created for the new
> region is used to trim intersections from the SOFT RESERVE
> resources that were previously set aside.
> 
> Once CXL device probe has completed, any remaining SOFT RESERVE resources
> are added to the iomem resource tree. As each resource
> is added to the iomem resource tree a new notifier chain is invoked
> to notify the dax driver of newly added SOFT RESERVE resources so that
> the dax driver can consume them.

Hi Nathan, this patch hit on all the mechanisms I would expect, but upon
reading it there is an opportunity to zoom out and do something blunter
than the surgical precision of this current proposal.

In other words, I appreciate the consideration of potential corner
cases, but for overall maintainability this should aim to be an all or
nothing approach.

Specifically, at the first sign of trouble, any CXL sub-driver probe
failure or region enumeration timeout, that the entire CXL topology be
torn down (trigger the equivalent of ->remove() on the ACPI0017 device),
and the deferred Soft Reserved ranges registered as if cxl_acpi was not
present (implement a fallback equivalent to hmem_register_devices()).

No need to trim resources as regions arrive, just tear down everything
setup in the cxl_acpi_probe() path with devres_release_all().

So, I am thinking export a flag from the CXL core that indicates whether
any conflict with platform-firmware established CXL regions has
occurred.

Read that flag from an cxl_acpi-driver-launched deferred workqueue that
is awaiting initial device probing to quiesce. If that flag indicates a
CXL enumeration failure then trigger devres_release_all() on the
ACPI0017 platform device and follow that up by walking the deferred Soft
Reserve resources to register raw (unparented by CXL regions) dax
devices.

Some more comments below:

> Signed-off-by: Nathan Fontenot <nathan.fontenot@amd.com>
> ---
>  arch/x86/kernel/e820.c    |  17 ++++-
>  drivers/cxl/core/region.c |   8 +-
>  drivers/cxl/port.c        |  15 ++++
>  drivers/dax/hmem/device.c |  13 ++--
>  drivers/dax/hmem/hmem.c   |  15 ++++
>  drivers/dax/hmem/hmem.h   |  11 +++
>  include/linux/dax.h       |   4 -
>  include/linux/ioport.h    |   6 ++
>  kernel/resource.c         | 155 +++++++++++++++++++++++++++++++++++++-
>  9 files changed, 229 insertions(+), 15 deletions(-)
>  create mode 100644 drivers/dax/hmem/hmem.h
> 
> diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
> index 4893d30ce438..cab82e9324a5 100644
> --- a/arch/x86/kernel/e820.c
> +++ b/arch/x86/kernel/e820.c
> @@ -1210,14 +1210,23 @@ static unsigned long __init ram_alignment(resource_size_t pos)
>  
>  void __init e820__reserve_resources_late(void)
>  {
> -	int i;
>  	struct resource *res;
> +	int i;
>  
> +	/*
> +	 * Prior to inserting SOFT_RESERVED resources we want to check for an
> +	 * intersection with potential CXL resources. Any SOFT_RESERVED resources
> +	 * that do intersect a potential CXL resource are set aside so they
> +	 * can be trimmed to accommodate CXL resource intersections and added to
> +	 * the iomem resource tree after the CXL drivers have completed their
> +	 * device probe.

Perhaps shorten to "see hmem_register_devices() and cxl_acpi_probe() for
deferred initialization of Soft Reserved ranges"

> +	 */
>  	res = e820_res;
> -	for (i = 0; i < e820_table->nr_entries; i++) {
> -		if (!res->parent && res->end)
> +	for (i = 0; i < e820_table->nr_entries; i++, res++) {
> +		if (res->desc == IORES_DESC_SOFT_RESERVED)
> +			insert_soft_reserve_resource(res);

I would only expect this deferral to happen when CONFIG_DEV_DAX_HMEM
and/or CONFIG_CXL_REGION  is enabled. It also needs to catch Soft
Reserved deferral on other, non-e820 based, archs. So, maybe this hackery
should be done internal to insert_resource_*(). Something like all
insert_resource() of IORES_DESC_SOFT_RESERVED is deferred until a flag
is flipped allowing future insertion attempts to succeed in adding them
to the ioresource_mem tree.

Not that I expect this problem will ever effect more than just CXL, but
it is already the case that Soft Reserved is set for more than just CXL
ranges, and who know what other backend Soft Reserved consumer drivers
might arrive later.

When CXL or HMEM parses the deferred entries they can take
responsibility for injecting the Soft Reserved entries. That achieves
continuity of the /proc/iomem contents across kernel versions while
giving those endpoint drivers the ability to unregister those resources.

> +		else if (!res->parent && res->end)
>  			insert_resource_expand_to_fit(&iomem_resource, res);
> -		res++;
>  	}
>  
>  	/*
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 21ad5f242875..c458a6313b31 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -3226,6 +3226,12 @@ static int match_region_by_range(struct device *dev, void *data)
>  	return rc;
>  }
>  
> +static int insert_region_resource(struct resource *parent, struct resource *res)
> +{
> +	trim_soft_reserve_resources(res);
> +	return insert_resource(parent, res);
> +}

Per above, lets not do dynamic trimming, it's all or nothing CXL memory
enumeration if the driver is trying and failing to parse any part of the
BIOS-established CXL configuration.

Yes, this could result in regressions in the other direction, but my
expectation is that the vast majority of CXL memory present at boot is
meant to be indistinguishable from DDR. In other words the current
default of "lose access to memory upon CXL enumeration failure that is
otherwise fully described by the EFI Memory Map" is the wrong default
policy.

> +
>  /* Establish an empty region covering the given HPA range */
>  static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
>  					   struct cxl_endpoint_decoder *cxled)
> @@ -3272,7 +3278,7 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
>  
>  	*res = DEFINE_RES_MEM_NAMED(hpa->start, range_len(hpa),
>  				    dev_name(&cxlr->dev));
> -	rc = insert_resource(cxlrd->res, res);
> +	rc = insert_region_resource(cxlrd->res, res);
>  	if (rc) {
>  		/*
>  		 * Platform-firmware may not have split resources like "System
> diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c
> index d7d5d982ce69..4461f2a80d72 100644
> --- a/drivers/cxl/port.c
> +++ b/drivers/cxl/port.c
> @@ -89,6 +89,20 @@ static int cxl_switch_port_probe(struct cxl_port *port)
>  	return -ENXIO;
>  }
>  
> +static void cxl_sr_update(struct work_struct *w)
> +{
> +	merge_soft_reserve_resources();
> +}
> +
> +DECLARE_DELAYED_WORK(cxl_sr_work, cxl_sr_update);
> +
> +static void schedule_soft_reserve_update(void)
> +{
> +	int timeout = 5 * HZ;
> +
> +	mod_delayed_work(system_wq, &cxl_sr_work, timeout);
> +}

For cases where there is Soft Reserved CXL backed memory it should be
sufficient to just wait for initial device probing to complete. So I
would just have cxl_acpi_probe() call wait_for_device_probe() in a
workqueue, rather than try to guess at a timeout. If anything, waiting
for driver core deferred probing timeout seems a good time to ask "are
we missing any CXL memory ranges?".

> +
>  static int cxl_endpoint_port_probe(struct cxl_port *port)
>  {
>  	struct cxl_endpoint_dvsec_info info = { .port = port };
> @@ -140,6 +154,7 @@ static int cxl_endpoint_port_probe(struct cxl_port *port)
>  	 */
>  	device_for_each_child(&port->dev, root, discover_region);
>  
> +	schedule_soft_reserve_update();
>  	return 0;
>  }
>  
> diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c
> index f9e1a76a04a9..c45791ad4858 100644
> --- a/drivers/dax/hmem/device.c
> +++ b/drivers/dax/hmem/device.c
> @@ -4,6 +4,7 @@
>  #include <linux/module.h>
>  #include <linux/dax.h>
>  #include <linux/mm.h>
> +#include "hmem.h"
>  
>  static bool nohmem;
>  module_param_named(disable, nohmem, bool, 0444);
> @@ -17,6 +18,9 @@ static struct resource hmem_active = {
>  	.flags = IORESOURCE_MEM,
>  };
>  
> +struct platform_device *hmem_pdev;
> +EXPORT_SYMBOL_GPL(hmem_pdev);
> +
>  int walk_hmem_resources(struct device *host, walk_hmem_fn fn)
>  {
>  	struct resource *res;
> @@ -35,7 +39,6 @@ EXPORT_SYMBOL_GPL(walk_hmem_resources);
>  
>  static void __hmem_register_resource(int target_nid, struct resource *res)
>  {
> -	struct platform_device *pdev;
>  	struct resource *new;
>  	int rc;
>  
> @@ -51,15 +54,15 @@ static void __hmem_register_resource(int target_nid, struct resource *res)
>  	if (platform_initialized)
>  		return;
>  
> -	pdev = platform_device_alloc("hmem_platform", 0);
> -	if (!pdev) {
> +	hmem_pdev = platform_device_alloc("hmem_platform", 0);
> +	if (!hmem_pdev) {
>  		pr_err_once("failed to register device-dax hmem_platform device\n");
>  		return;
>  	}
>  
> -	rc = platform_device_add(pdev);
> +	rc = platform_device_add(hmem_pdev);
>  	if (rc)
> -		platform_device_put(pdev);
> +		platform_device_put(hmem_pdev);
>  	else
>  		platform_initialized = true;

So, I don't think anyone actually cares which device parents a dax
device. It would be cleaner if cxl_acpi registered the Soft Reserved dax
devices that the hmem driver was told to skip.

That change eliminates the need for a notifier to trigger the hmem
driver to add devices after a CXL enumeration failure.

[ .. trim all the fine grained resource handling and notifier code .. ]

The end result of this effort is that the Linux CXL subsystem will
aggressively complain and refuse to run with platforms and devices that
deviate from common expectations. That gives space for Soft Reserved
generic support to fill some gaps while quirks, hacks, and workarounds
are developed to compensate for these deviations. Otherwise it has been
a constant drip of "what in the world is that platform doing?", and the
current policy of "try to depend on standard CXL enumeration" is too
fragile.
Dan Williams Dec. 11, 2024, 10:53 p.m. UTC | #9
Dan Williams wrote:
> Nathan Fontenot wrote:
> > Update handling of SOFT RESERVE iomem resources that intersect with
> > CXL region resources to remove the intersections from the SOFT RESERVE
> > resources. The current approach of leaving the SOFT RESERVE
> > resource as is can cause failures during hotplug replace of CXL
> > devices because the resource is not available for reuse after
> > teardown of the CXL device.
> > 
> > The approach is to trim out any pieces of SOFT RESERVE resources
> > that intersect CXL regions. To do this, first set aside any SOFT RESERVE
> > resources that intersect with a CFMWS into a separate resource tree
> > during e820__reserve_resources_late() that would have been otherwise
> > added to the iomem resource tree.
> > 
> > As CXL regions are created the cxl resource created for the new
> > region is used to trim intersections from the SOFT RESERVE
> > resources that were previously set aside.
> > 
> > Once CXL device probe has completed ant remaining SOFT RESERVE resources
> > remaining are added to the iomem resource tree. As each resource
> > is added to the oiomem resource tree a new notifier chain is invoked
> > to notify the dax driver of newly added SOFT RESERVE resources so that
> > the dax driver can consume them.
> 
> Hi Nathan, this patch hit on all the mechanisms I would expect, but upon
> reading it there is an opportunity to zoom out and do something blunter
> than the surgical precision of this current proposal.

Note that the reason I have new / enhanced focus on simplicity is that
this mechanism is probably our only way to resolve regressions like
this:

http://lore.kernel.org/d8d2c310-2021-431f-adbe-71ad0a17896a@amd.com

In other words, mainline is now failing to enumerate memory in more
scenarios than it was previously. I do not want to be forcing Linus and
the -stable team to review wider changes to kernel/resource.c than is
necessary. I.e. there is a chance this proposal needs to seek
v6.13-rc consideration.
Alison Schofield Dec. 12, 2024, 2:07 a.m. UTC | #10
On Wed, Dec 11, 2024 at 02:30:59PM -0800, Dan Williams wrote:
> Nathan Fontenot wrote:
> > Update handling of SOFT RESERVE iomem resources that intersect with
> > CXL region resources to remove the intersections from the SOFT RESERVE
> > resources. The current approach of leaving the SOFT RESERVE
> > resource as is can cause failures during hotplug replace of CXL
> > devices because the resource is not available for reuse after
> > teardown of the CXL device.
> > 
> > The approach is to trim out any pieces of SOFT RESERVE resources
> > that intersect CXL regions. To do this, first set aside any SOFT RESERVE
> > resources that intersect with a CFMWS into a separate resource tree
> > during e820__reserve_resources_late() that would have been otherwise
> > added to the iomem resource tree.
> > 
> > As CXL regions are created the cxl resource created for the new
> > region is used to trim intersections from the SOFT RESERVE
> > resources that were previously set aside.
> > 
> > Once CXL device probe has completed ant remaining SOFT RESERVE resources
> > remaining are added to the iomem resource tree. As each resource
> > is added to the oiomem resource tree a new notifier chain is invoked
> > to notify the dax driver of newly added SOFT RESERVE resources so that
> > the dax driver can consume them.
> 
> Hi Nathan, this patch hit on all the mechanisms I would expect, but upon
> reading it there is an opportunity to zoom out and do something blunter
> than the surgical precision of this current proposal.
> 
> In other words, I appreciate the consideration of potential corner
> cases, but for overall maintainability this should aim to be an all or
> nothing approach.
> 
> Specifically, at the first sign of trouble, any CXL sub-driver probe
> failure or region enumeration timeout, that the entire CXL topology be
> torn down (trigger the equivalent of ->remove() on the ACPI0017 device),
> and the deferred Soft Reserved ranges registered as if cxl_acpi was not
> present (implement a fallback equivalent to hmem_register_devices()).
> 
> No need to trim resources as regions arrive, just tear down everything
> setup in the cxl_acpi_probe() path with devres_release_all().
> 
> So, I am thinking export a flag from the CXL core that indicates whether
> any conflict with platform-firmware established CXL regions has
> occurred.
> 
> Read that flag from an cxl_acpi-driver-launched deferred workqueue that
> is awaiting initial device probing to quiesce. If that flag indicates a
> CXL enumeration failure then trigger devres_release_all() on the
> ACPI0017 platform device and follow that up by walking the deferred Soft
> Reserve resources to register raw (unparented by CXL regions) dax
> devices.
> 

This reads like a 'poison pill' case that is in addition to the original
use cases that inspired this patch. I'm not sure I'm getting the 'all or
nothing' language.

The patch was doing Case 1) and 2). This poison-pill approach is 3)

1) SR aligns exactly with a CXL Window.
This patch removes that SR so the address space is available for reuse if
an auto-region is torn down.

2) SR is larger than a CXL Window.
Where the SR aligns with the CXL Window the SR is handled as in 1).
The SR leftovers are released to DAX.

3) Failure in auto-region assembly.
New case: tear down ACPI0017 and release all SRs.

So, after writing the above, maybe I get what the 'all' case is.
Are you suggesting we stop trimming and ignore leftovers and just
consider both 1) and 2) the 'all' case? No SR ever makes it into
the iomem resource tree so all resources are available for reuse
after teardown. And then there is 3), the nothing case! I get that.

It will waste some leftovers that could have been handed to DAX.
I know the SR greater than CXL Window came in handy for hotplug when
the SRs were not released, but I never understood if SRs greater
than CXL Window intended to serve any purpose.


> Some more comments below:

snip for now

>
Dan Williams Dec. 12, 2024, 3:20 a.m. UTC | #11
Alison Schofield wrote:
[..]
> > Hi Nathan, this patch hit on all the mechanisms I would expect, but upon
> > reading it there is an opportunity to zoom out and do something blunter
> > than the surgical precision of this current proposal.
> > 
> > In other words, I appreciate the consideration of potential corner
> > cases, but for overall maintainability this should aim to be an all or
> > nothing approach.
> > 
> > Specifically, at the first sign of trouble, any CXL sub-driver probe
> > failure or region enumeration timeout, that the entire CXL topology be
> > torn down (trigger the equivalent of ->remove() on the ACPI0017 device),
> > and the deferred Soft Reserved ranges registered as if cxl_acpi was not
> > present (implement a fallback equivalent to hmem_register_devices()).
> > 
> > No need to trim resources as regions arrive, just tear down everything
> > setup in the cxl_acpi_probe() path with devres_release_all().
> > 
> > So, I am thinking export a flag from the CXL core that indicates whether
> > any conflict with platform-firmware established CXL regions has
> > occurred.
> > 
> > Read that flag from an cxl_acpi-driver-launched deferred workqueue that
> > is awaiting initial device probing to quiesce. If that flag indicates a
> > CXL enumeration failure then trigger devres_release_all() on the
> > ACPI0017 platform device and follow that up by walking the deferred Soft
> > Reserve resources to register raw (unparented by CXL regions) dax
> > devices.
> > 
> 
> This reads like a 'poison pill' case that is in addition to the original
> use cases that inspired this patch. I'm not sure I'm getting the 'all or
> nothing' language.

Read through this recent regression thread:

   http://lore.kernel.org/20241202111941.2636613-1-raghavendra.kt@amd.com

That is evidence that the ways in which platform CXL configurations can
confuse the CXL subsystem are myriad and seemingly growing.

So this change of heart from me is realizing that the solution needed
here is less about how to recover the Soft Reserved ranges after CXL
successfully takes over, it is about abandoning the notion that the
driver should continue to best-effort fumble along after discovering
that a fundamental assumption has been violated.

For example this would also mitigate the "Extended Linear Cache"
enumeration problem:

    http://lore.kernel.org/20241204224827.2097263-1-dave.jiang@intel.com

...where the driver fools itself into thinking it knows what is
happening just based on reading hardware decoders, but not realizing
that DDR aliases into the same physical address range. Yes, you lose
important CXL subsystem services like address translation, but it is
arguably worse to confidently offer a bad translation based on wrong
assumption than offer no translation with an undecorated dax device, and
in the meantime the memory is usable.

The driver should stop guessing and say, "there is reason to believe I
have no idea what this platform is actually doing. The platform firmware
seems to think it knows what is coherent / enabled memory range. In the
interests of not dropping precious resources on the floor, or proceeding
with potential misconfiguration of the memory map, just revert to 100%
memory-mapped enumerated resources without CXL decoration."

> The patch was doing Case 1) and 2). This poison-pill approach is 3)

So it is not "poison-pill" or "sour grapes", it is "irresponsible to
continue when fundamental assumptions are violated, and wasteful to
leave the system with reduced memory capacity when platform firmware put
it in the memory map".

The "unwind SR on CXL region destruction" use case comes along for the
ride as a bonus feature.

> 
> 1) SR aligns exactly with a CXL Window.
> This patch removes that SR so the address space is available for reuse if
> an auto-region is torn down.
> 
> 2) SR is larger than a CXL Window.
> Where the SR aligns with the CXL Window the SR is handled as in 1).
> The SR leftovers are released to DAX.
> 
> 3) Failure in auto-region assembly.
> New case: tear down ACPI0017 and release all SRs.

release all SRs back to default enumeration behavior.

> So, after writing the above, maybe I get what the 'all' case is.
> Are you suggesting we stop trimming and ignore leftovers and just
> consider both 1) and 2) the 'all' case? No SR ever makes it into
> the iomem resource tree so all resources are available for reuse
> after teardown. And then there is 3), the nothing case! I get that.
> 
> It will waste some leftovers that could have been handed to DAX.
> I know the SR greater than CXL Window came in handy for hotplug when
> the SRs were not released, but I never understood if SRs greater
> than CXL Window intended to serve any purpose.

...and that is another example to prove the point. If the CXL subsystem
can not definitively say why it is trimming too large SR resources based
on the CXL Window size it should default to giving the end user all the
memory promised in the EFI memory map.
Fontenot, Nathan Dec. 12, 2024, 6:12 p.m. UTC | #12
On 12/11/2024 4:30 PM, Dan Williams wrote:
> Nathan Fontenot wrote:
>> Update handling of SOFT RESERVE iomem resources that intersect with
>> CXL region resources to remove the intersections from the SOFT RESERVE
>> resources. The current approach of leaving the SOFT RESERVE
>> resource as is can cause failures during hotplug replace of CXL
>> devices because the resource is not available for reuse after
>> teardown of the CXL device.
>>
>> The approach is to trim out any pieces of SOFT RESERVE resources
>> that intersect CXL regions. To do this, first set aside any SOFT RESERVE
>> resources that intersect with a CFMWS into a separate resource tree
>> during e820__reserve_resources_late() that would have been otherwise
>> added to the iomem resource tree.
>>
>> As CXL regions are created the cxl resource created for the new
>> region is used to trim intersections from the SOFT RESERVE
>> resources that were previously set aside.
>>
>> Once CXL device probe has completed ant remaining SOFT RESERVE resources
>> remaining are added to the iomem resource tree. As each resource
>> is added to the oiomem resource tree a new notifier chain is invoked
>> to notify the dax driver of newly added SOFT RESERVE resources so that
>> the dax driver can consume them.
> 
> Hi Nathan, this patch hit on all the mechanisms I would expect, but upon
> reading it there is an opportunity to zoom out and do something blunter
> than the surgical precision of this current proposal.
> 
> In other words, I appreciate the consideration of potential corner
> cases, but for overall maintainability this should aim to be an all or
> nothing approach.
> 
> Specifically, at the first sign of trouble, any CXL sub-driver probe
> failure or region enumeration timeout, that the entire CXL topology be
> torn down (trigger the equivalent of ->remove() on the ACPI0017 device),
> and the deferred Soft Reserved ranges registered as if cxl_acpi was not
> present (implement a fallback equivalent to hmem_register_devices()).
> 
> No need to trim resources as regions arrive, just tear down everything
> setup in the cxl_acpi_probe() path with devres_release_all().
> 
> So, I am thinking export a flag from the CXL core that indicates whether
> any conflict with platform-firmware established CXL regions has
> occurred.
> 
> Read that flag from an cxl_acpi-driver-launched deferred workqueue that
> is awaiting initial device probing to quiesce. If that flag indicates a
> CXL enumeration failure then trigger devres_release_all() on the
> ACPI0017 platform device and follow that up by walking the deferred Soft
> Reserve resources to register raw (unparented by CXL regions) dax
> devices.
> 

From my initial reading of this, are you suggesting we tear down all
CXL devices for any single failure? That seems a bit extreme. If I'm
reading that wrong, let me know.

If you remember, I did try an approach previously using wait_for_device_probe()
in the cxl_acpi driver. This didn't work because the wait would return before
probe of the CXL devices would complete. From what I saw in the
wait_for_device_probe() code is that it waits for drivers registered at the
time it is called which ends up being before the other cxl drivers are registered.

This was the reason to switch a deferred workqueue approach. I do agree that
this can become a guessing game on how long to wait and is likely to not wait
long enough for a given configuration.

I'm open to suggestions for other approaches from anyone on determining
when CXL device probe completes.

> Some more comments below:
> 
>> Signed-off-by: Nathan Fontenot <nathan.fontenot@amd.com>
>> ---
>>  arch/x86/kernel/e820.c    |  17 ++++-
>>  drivers/cxl/core/region.c |   8 +-
>>  drivers/cxl/port.c        |  15 ++++
>>  drivers/dax/hmem/device.c |  13 ++--
>>  drivers/dax/hmem/hmem.c   |  15 ++++
>>  drivers/dax/hmem/hmem.h   |  11 +++
>>  include/linux/dax.h       |   4 -
>>  include/linux/ioport.h    |   6 ++
>>  kernel/resource.c         | 155 +++++++++++++++++++++++++++++++++++++-
>>  9 files changed, 229 insertions(+), 15 deletions(-)
>>  create mode 100644 drivers/dax/hmem/hmem.h
>>
>> diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
>> index 4893d30ce438..cab82e9324a5 100644
>> --- a/arch/x86/kernel/e820.c
>> +++ b/arch/x86/kernel/e820.c
>> @@ -1210,14 +1210,23 @@ static unsigned long __init ram_alignment(resource_size_t pos)
>>  
>>  void __init e820__reserve_resources_late(void)
>>  {
>> -	int i;
>>  	struct resource *res;
>> +	int i;
>>  
>> +	/*
>> +	 * Prior to inserting SOFT_RESERVED resources we want to check for an
>> +	 * intersection with potential CXL resources. Any SOFT_RESERVED resources
>> +	 * that do intersect a potential CXL resource are set aside so they
>> +	 * can be trimmed to accommodate CXL resource intersections and added to
>> +	 * the iomem resource tree after the CXL drivers have completed their
>> +	 * device probe.
> 
> Perhaps shorten to "see hmem_register_devices() and cxl_acpi_probe() for
> deferred initialization of Soft Reserved ranges"
> 
>> +	 */
>>  	res = e820_res;
>> -	for (i = 0; i < e820_table->nr_entries; i++) {
>> -		if (!res->parent && res->end)
>> +	for (i = 0; i < e820_table->nr_entries; i++, res++) {
>> +		if (res->desc == IORES_DESC_SOFT_RESERVED)
>> +			insert_soft_reserve_resource(res);
> 
> I would only expect this deferral to happen when CONFIG_DEV_DAX_HMEM
> and/or CONFIG_CXL_REGION  is enabled. It also needs to catch Soft
> Reserved deferral on other, non-e820 based, archs. So, maybe this hackery
> should be done internal to insert_resource_*(). Something like all
> insert_resource() of IORES_DESC_SOFT_RESERVED is deferred until a flag
> is flipped allowing future insertion attempts to succeed in adding them
> to the ioresource_mem tree.
>

Good point on non-e820 archs. I can move the check to insert_resource() and
add checks for CONFIG_DEV_DAX_HMEM/CONFIG_CXL_REGION enablement.

> Not that I expect this problem will ever effect more than just CXL, but
> it is already the case that Soft Reserved is set for more than just CXL
> ranges, and who know what other backend Soft Reserved consumer drivers
> might arrive later.
> 
> When CXL or HMEM parses the deferred entries they can take
> responsibility for injecting the Soft Reserved entries. That achieves
> continuity of the /proc/iomem contents across kernel versions while
> giving those endpoint drivers the ability to unregister those resources.
> 
>> +		else if (!res->parent && res->end)
>>  			insert_resource_expand_to_fit(&iomem_resource, res);
>> -		res++;
>>  	}
>>  
>>  	/*
>> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
>> index 21ad5f242875..c458a6313b31 100644
>> --- a/drivers/cxl/core/region.c
>> +++ b/drivers/cxl/core/region.c
>> @@ -3226,6 +3226,12 @@ static int match_region_by_range(struct device *dev, void *data)
>>  	return rc;
>>  }
>>  
>> +static int insert_region_resource(struct resource *parent, struct resource *res)
>> +{
>> +	trim_soft_reserve_resources(res);
>> +	return insert_resource(parent, res);
>> +}
> 
> Per above, lets not do dynamic trimming, it's all or nothing CXL memory
> enumeration if the driver is trying and failing to parse any part of the
> BIOS-established CXL configuration.

That can be done. I felt it was easier to trim the SR resources as CXL regions
were created instead of going back and finding all the CXL regions after all
device probe completed and trimming them.

> 
> Yes, this could result in regressions in the other direction, but my
> expectation is that the vast majority of CXL memory present at boot is
> meant to be indistinguishable from DDR. In other words the current
> default of "lose access to memory upon CXL enumeration failure that is
> otherwise fully described by the EFI Memory Map" is the wrong default
> policy.
> 
>> +
>>  /* Establish an empty region covering the given HPA range */
>>  static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
>>  					   struct cxl_endpoint_decoder *cxled)
>> @@ -3272,7 +3278,7 @@ static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
>>  
>>  	*res = DEFINE_RES_MEM_NAMED(hpa->start, range_len(hpa),
>>  				    dev_name(&cxlr->dev));
>> -	rc = insert_resource(cxlrd->res, res);
>> +	rc = insert_region_resource(cxlrd->res, res);
>>  	if (rc) {
>>  		/*
>>  		 * Platform-firmware may not have split resources like "System
>> diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c
>> index d7d5d982ce69..4461f2a80d72 100644
>> --- a/drivers/cxl/port.c
>> +++ b/drivers/cxl/port.c
>> @@ -89,6 +89,20 @@ static int cxl_switch_port_probe(struct cxl_port *port)
>>  	return -ENXIO;
>>  }
>>  
>> +static void cxl_sr_update(struct work_struct *w)
>> +{
>> +	merge_soft_reserve_resources();
>> +}
>> +
>> +DECLARE_DELAYED_WORK(cxl_sr_work, cxl_sr_update);
>> +
>> +static void schedule_soft_reserve_update(void)
>> +{
>> +	int timeout = 5 * HZ;
>> +
>> +	mod_delayed_work(system_wq, &cxl_sr_work, timeout);
>> +}
> 
> For cases where there is Soft Reserved CXL backed memory it should be
> sufficient to just wait for initial device probing to complete. So I
> would just have cxl_acpi_probe() call wait_for_device_probe() in a
> workqueue, rather than try to guess at a timeout. If anything, waiting
> for driver core deferred probing timeout seems a good time to ask "are
> we missing any CXL memory ranges?".
> 
>> +
>>  static int cxl_endpoint_port_probe(struct cxl_port *port)
>>  {
>>  	struct cxl_endpoint_dvsec_info info = { .port = port };
>> @@ -140,6 +154,7 @@ static int cxl_endpoint_port_probe(struct cxl_port *port)
>>  	 */
>>  	device_for_each_child(&port->dev, root, discover_region);
>>  
>> +	schedule_soft_reserve_update();
>>  	return 0;
>>  }
>>  
>> diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c
>> index f9e1a76a04a9..c45791ad4858 100644
>> --- a/drivers/dax/hmem/device.c
>> +++ b/drivers/dax/hmem/device.c
>> @@ -4,6 +4,7 @@
>>  #include <linux/module.h>
>>  #include <linux/dax.h>
>>  #include <linux/mm.h>
>> +#include "hmem.h"
>>  
>>  static bool nohmem;
>>  module_param_named(disable, nohmem, bool, 0444);
>> @@ -17,6 +18,9 @@ static struct resource hmem_active = {
>>  	.flags = IORESOURCE_MEM,
>>  };
>>  
>> +struct platform_device *hmem_pdev;
>> +EXPORT_SYMBOL_GPL(hmem_pdev);
>> +
>>  int walk_hmem_resources(struct device *host, walk_hmem_fn fn)
>>  {
>>  	struct resource *res;
>> @@ -35,7 +39,6 @@ EXPORT_SYMBOL_GPL(walk_hmem_resources);
>>  
>>  static void __hmem_register_resource(int target_nid, struct resource *res)
>>  {
>> -	struct platform_device *pdev;
>>  	struct resource *new;
>>  	int rc;
>>  
>> @@ -51,15 +54,15 @@ static void __hmem_register_resource(int target_nid, struct resource *res)
>>  	if (platform_initialized)
>>  		return;
>>  
>> -	pdev = platform_device_alloc("hmem_platform", 0);
>> -	if (!pdev) {
>> +	hmem_pdev = platform_device_alloc("hmem_platform", 0);
>> +	if (!hmem_pdev) {
>>  		pr_err_once("failed to register device-dax hmem_platform device\n");
>>  		return;
>>  	}
>>  
>> -	rc = platform_device_add(pdev);
>> +	rc = platform_device_add(hmem_pdev);
>>  	if (rc)
>> -		platform_device_put(pdev);
>> +		platform_device_put(hmem_pdev);
>>  	else
>>  		platform_initialized = true;
> 
> So, I don't think anyone actually cares which device parents a dax
> device. It would be cleaner if cxl_acpi registered the Soft Reserved dax
> devices that the hmem driver was told to skip.
> 
> That change eliminates the need for a notifier to trigger the hmem
> driver to add devices after a CXL enumeration failure.

Ok, I do like this better than adding a notification chain to
kernel/resource.c for what felt like a one-time notification.

-Nathan

> 
> [ .. trim all the fine grained resource handling and notifier code .. ]
> 
> The end result of this effort is that the Linux CXL subsystem will
> aggressively complain and refuse to run with platforms and devices that
> deviate from common expectations. That gives space for Soft Reserved
> generic support to fill some gaps while quirks, hacks, and workarounds
> are developed to compensate for these deviations. Otherwise it has been
> a constant drip of "what in the world is that platform doing?", and the
> current policy of "try to depend on standard CXL enumeration" is too
> fragile.
Dan Williams Dec. 12, 2024, 7:57 p.m. UTC | #13
Fontenot, Nathan wrote:
> 
> 
> On 12/11/2024 4:30 PM, Dan Williams wrote:
> > Nathan Fontenot wrote:
> >> Update handling of SOFT RESERVE iomem resources that intersect with
> >> CXL region resources to remove the intersections from the SOFT RESERVE
> >> resources. The current approach of leaving the SOFT RESERVE
> >> resource as is can cause failures during hotplug replace of CXL
> >> devices because the resource is not available for reuse after
> >> teardown of the CXL device.
> >>
> >> The approach is to trim out any pieces of SOFT RESERVE resources
> >> that intersect CXL regions. To do this, first set aside any SOFT RESERVE
> >> resources that intersect with a CFMWS into a separate resource tree
> >> during e820__reserve_resources_late() that would have been otherwise
> >> added to the iomem resource tree.
> >>
> >> As CXL regions are created the cxl resource created for the new
> >> region is used to trim intersections from the SOFT RESERVE
> >> resources that were previously set aside.
> >>
> >> Once CXL device probe has completed ant remaining SOFT RESERVE resources
> >> remaining are added to the iomem resource tree. As each resource
> >> is added to the oiomem resource tree a new notifier chain is invoked
> >> to notify the dax driver of newly added SOFT RESERVE resources so that
> >> the dax driver can consume them.
> > 
> > Hi Nathan, this patch hit on all the mechanisms I would expect, but upon
> > reading it there is an opportunity to zoom out and do something blunter
> > than the surgical precision of this current proposal.
> > 
> > In other words, I appreciate the consideration of potential corner
> > cases, but for overall maintainability this should aim to be an all or
> > nothing approach.
> > 
> > Specifically, at the first sign of trouble, any CXL sub-driver probe
> > failure or region enumeration timeout, that the entire CXL topology be
> > torn down (trigger the equivalent of ->remove() on the ACPI0017 device),
> > and the deferred Soft Reserved ranges registered as if cxl_acpi was not
> > present (implement a fallback equivalent to hmem_register_devices()).
> > 
> > No need to trim resources as regions arrive, just tear down everything
> > setup in the cxl_acpi_probe() path with devres_release_all().
> > 
> > So, I am thinking export a flag from the CXL core that indicates whether
> > any conflict with platform-firmware established CXL regions has
> > occurred.
> > 
> > Read that flag from an cxl_acpi-driver-launched deferred workqueue that
> > is awaiting initial device probing to quiesce. If that flag indicates a
> > CXL enumeration failure then trigger devres_release_all() on the
> > ACPI0017 platform device and follow that up by walking the deferred Soft
> > Reserve resources to register raw (unparented by CXL regions) dax
> > devices.
> > 
> 
> From my initial reading of this are you suggesting we tear down all
> CXL devices for any single failure? That seems a bit extreme. If I'm
> reading that wrong let me know.

I would say "comprehensive" rather than extreme. It could be softened a
bit to say that any incomplete or unhandled soft-reserved ranges get
returned to pure dax operation.

Otherwise the fact that this patch as is did not resolve the recent
regression report [1] is a sign that it needs to be more aggressive.

So if a softened approach can be done with low complexity, lets do it,
but the status quo is already extreme: end users are losing access to
their memory because the CXL subsystem fails to interpret the
configuration.

I.e. the minimum requirement for me is that all Soft Reserved ranges end
up as dax-devices.

[1]: http://lore.kernel.org/d8d2c310-2021-431f-adbe-71ad0a17896a@amd.com

> If you remember I did try an approach previously using a wait_for_device_probe()
> in the cxl_acpi driver. This didn't work because the wait would return before
> probe would complete of the CXL devices. From what I saw in the
> wait_for_device_probe() code is that it waits for drivers registered at the
> time it is called which ends up being before the other cxl drivers are registered.

Right, I think that's solvable, especially now that some of the init
fixes have gone in [2] where cxl_acpi knows that the cxl_port driver has
had a chance to load.

It might require a gate like:

    cxl_acpi_flush_probe(...) {
    
        wait_event(<cxl_pci && cxl_mem have signaled that they have loaded>)
    
        wait_for_device_probe()
        ....
    }

[2]: http://lore.kernel.org/172964779333.81806.8852577918216421011.stgit@dwillia2-xfh.jf.intel.com

> This was the reason to switch a deferred workqueue approach. I do agree that
> this can become a guessing game on how long to wait and is likely to not wait
> long enough for a given configuration.
> 
> I'm open to suggestions for other approaches from anyone on determining
> when CXL device probe completes.
> 
> > Some more comments below:
> > 
> >> Signed-off-by: Nathan Fontenot <nathan.fontenot@amd.com>
> >> ---
> >>  arch/x86/kernel/e820.c    |  17 ++++-
> >>  drivers/cxl/core/region.c |   8 +-
> >>  drivers/cxl/port.c        |  15 ++++
> >>  drivers/dax/hmem/device.c |  13 ++--
> >>  drivers/dax/hmem/hmem.c   |  15 ++++
> >>  drivers/dax/hmem/hmem.h   |  11 +++
> >>  include/linux/dax.h       |   4 -
> >>  include/linux/ioport.h    |   6 ++
> >>  kernel/resource.c         | 155 +++++++++++++++++++++++++++++++++++++-
> >>  9 files changed, 229 insertions(+), 15 deletions(-)
> >>  create mode 100644 drivers/dax/hmem/hmem.h
> >>
> >> diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
> >> index 4893d30ce438..cab82e9324a5 100644
> >> --- a/arch/x86/kernel/e820.c
> >> +++ b/arch/x86/kernel/e820.c
> >> @@ -1210,14 +1210,23 @@ static unsigned long __init ram_alignment(resource_size_t pos)
> >>  
> >>  void __init e820__reserve_resources_late(void)
> >>  {
> >> -	int i;
> >>  	struct resource *res;
> >> +	int i;
> >>  
> >> +	/*
> >> +	 * Prior to inserting SOFT_RESERVED resources we want to check for an
> >> +	 * intersection with potential CXL resources. Any SOFT_RESERVED resources
> >> +	 * that do intersect a potential CXL resource are set aside so they
> >> +	 * can be trimmed to accommodate CXL resource intersections and added to
> >> +	 * the iomem resource tree after the CXL drivers have completed their
> >> +	 * device probe.
> > 
> > Perhaps shorten to "see hmem_register_devices() and cxl_acpi_probe() for
> > deferred initialization of Soft Reserved ranges"
> > 
> >> +	 */
> >>  	res = e820_res;
> >> -	for (i = 0; i < e820_table->nr_entries; i++) {
> >> -		if (!res->parent && res->end)
> >> +	for (i = 0; i < e820_table->nr_entries; i++, res++) {
> >> +		if (res->desc == IORES_DESC_SOFT_RESERVED)
> >> +			insert_soft_reserve_resource(res);
> > 
> > I would only expect this deferral to happen when CONFIG_DEV_DAX_HMEM
> > and/or CONFIG_CXL_REGION  is enabled. It also needs to catch Soft
> > Reserved deferral on other, non-e820 based, archs. So, maybe this hackery
> > should be done internal to insert_resource_*(). Something like all
> > insert_resource() of IORES_DESC_SOFT_RESERVED is deferred until a flag
> > is flipped allowing future insertion attempts to succeed in adding them
> > to the ioresource_mem tree.
> >
> 
> Good point on non-e820 archs. I can move the check insert_resource() and
> add checks for CONFIG_DEV_DAX_HMEM/CONFIG_CXL_REGION enablement.  

Perhaps it can be more general with a new:

config SOFT_RESERVE_CONSUMER
	bool

...which drivers, that scan the soft-reserve resources for ranges that
belong to them like DEV_DAX_HMEM and CXL_REGION, select. 

I am assuming that we want to define a point in the kernel init flow
where insert_resource() stops deferring Soft Reserved to this lookaside
tree, just not sure where that should be.

> 
> > Not that I expect this problem will ever effect more than just CXL, but
> > it is already the case that Soft Reserved is set for more than just CXL
> > ranges, and who know what other backend Soft Reserved consumer drivers
> > might arrive later.
> > 
> > When CXL or HMEM parses the deferred entries they can take
> > responsibility for injecting the Soft Reserved entries. That achieves
> > continuity of the /proc/iomem contents across kernel versions while
> > giving those endpoint drivers the ability to unregister those resources.
> > 
> >> +		else if (!res->parent && res->end)
> >>  			insert_resource_expand_to_fit(&iomem_resource, res);
> >> -		res++;
> >>  	}
> >>  
> >>  	/*
> >> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> >> index 21ad5f242875..c458a6313b31 100644
> >> --- a/drivers/cxl/core/region.c
> >> +++ b/drivers/cxl/core/region.c
> >> @@ -3226,6 +3226,12 @@ static int match_region_by_range(struct device *dev, void *data)
> >>  	return rc;
> >>  }
> >>  
> >> +static int insert_region_resource(struct resource *parent, struct resource *res)
> >> +{
> >> +	trim_soft_reserve_resources(res);
> >> +	return insert_resource(parent, res);
> >> +}
> > 
> > Per above, lets not do dynamic trimming, it's all or nothing CXL memory
> > enumeration if the driver is trying and failing to parse any part of the
> > BIOS-established CXL configuration.
> 
> That can be done. I felt it was easier to trim the SR resources as CXL regions
> were created instead of going back and finding all the CXL regions after all
> device probe completed and trimming them.

The problem is that at resource creation time in construct_region() it
is unknown whether that region will actually complete assembly. That is
why I was thinking that committing to CXL taking over the range needs to
be after that "all devices present at boot have probably finished
probing" event.

The easiest is undo everything on failure since that does not require
new code, but we could do the softer, "trim everything that got to the
point of succeeding cxl_region_probe(), and fallback to undecorated
dax-devices for the rest".
Gregory Price Dec. 12, 2024, 10:42 p.m. UTC | #14
On Mon, Dec 02, 2024 at 09:55:42AM -0600, Nathan Fontenot wrote:
... snip ...
> diff --git a/kernel/resource.c b/kernel/resource.c
> index a83040fde236..8fc4121a1887 100644
> --- a/kernel/resource.c
> +++ b/kernel/resource.c
... snip ...
> +static int srmem_parse_cfmws(union acpi_subtable_headers *hdr,
> +			     void *arg, const unsigned long unused)
> +{

Chiming in a little late to the party here

I don't think encoding CXL-specific terminology and functionality
directly into kernel/resource.c is wise by any measure.

The abstraction here is completely inverted, and this is probably
in line with Dan's comments.

The comments in e820.c allude to a similar issue

    * Prior to inserting SOFT_RESERVED resources we want to check
    * for an intersection with potential CXL resources.

This is similarly inverted - e820 doesn't know anything about CXL
and it shouldn't have to be made aware of CXL. Mucking with
e820 is *begging* to be bitten elsewhere in parts of the system
that depend on it to be a relatively stable source of truth.


This tells me that this patch is trying to solve the wrong problem.


Your changelog alludes to supporting hotplug replace

"""
  The current approach of leaving the SOFT RESERVE resource as is can
  cause failure during hotplug replace of CXL devices because the 
  resource is not available for reuse after teardown of the CXL device.
"""

It sounds like we should be making the SR resource available for
re-use through proper teardown and cleanup of the resource tree,
rather than trying to change fundamental components like e820.

If the driver was capable of using the SOFT_RESERVED region on
initial setup, it should be capable of re-using that region.


Is the issue here that the hotplug-replaced component has a
different capacity? Is it being assigned a new region entirely?
Is it exactly the same, but the resource isn't being cleaned up?

Can you provide more specifics about the exact hotplug interaction
that is happening? That might help understand the issue a bit better.


Much of this sounds like we need additional better tear-down
management and possibly additional cxl/acpi features to handle 
hotplug of these devices - rather than changing resource.c.


~Gregory

> +	struct acpi_cedt_cfmws *cfmws;
> +	struct srmem_arg *args = arg;
> +	struct resource cfmws_res;
> +	struct resource *res;
> +
> +	res = args->res;
> +
> +	cfmws = (struct acpi_cedt_cfmws *)hdr;
> +	cfmws_res = DEFINE_RES_MEM(cfmws->base_hpa,
> +				   cfmws->base_hpa + cfmws->window_size);
> +
> +	if (resource_overlaps(&cfmws_res, res)) {
> +		args->overlaps += 1;
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +static bool resource_overlaps_cfmws(struct resource *res)
> +{
> +	struct srmem_arg arg = {
> +		.res = res,
> +		.overlaps = 0
> +	};
> +
> +	acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, srmem_parse_cfmws, &arg);
> +
> +	if (arg.overlaps)
> +		return true;
> +
> +	return false;
> +}
> +
> +int insert_soft_reserve_resource(struct resource *res)
> +{
> +	if (resource_overlaps_cfmws(res)) {
> +		pr_info("Reserving Soft Reserve %pr\n", res);
> +		return insert_resource(&srmem_resource, res);
> +	}
> +
> +	return insert_resource(&iomem_resource, res);
> +}
> +EXPORT_SYMBOL(insert_soft_reserve_resource);
> +
>  static void __init
>  __reserve_region_with_split(struct resource *root, resource_size_t start,
>  			    resource_size_t end, const char *name)
> -- 
> 2.43.0
>
Alison Schofield Dec. 13, 2024, 1:01 a.m. UTC | #15
On Thu, Dec 12, 2024 at 05:42:08PM -0500, Gregory Price wrote:
> On Mon, Dec 02, 2024 at 09:55:42AM -0600, Nathan Fontenot wrote:
> ... snip ...
> > diff --git a/kernel/resource.c b/kernel/resource.c
> > index a83040fde236..8fc4121a1887 100644
> > --- a/kernel/resource.c
> > +++ b/kernel/resource.c
> ... snip ...
> > +static int srmem_parse_cfmws(union acpi_subtable_headers *hdr,
> > +			     void *arg, const unsigned long unused)
> > +{
> 
> Chiming in a little late to the party here
> 
> I don't think encoding CXL-specific terminology and functionality
> directly into kernel/resource.c is wise by any measure.
> 
> The abstraction here is completely inverted, and this is probably
> in line with Dan's comments.
> 
> The comments in e820.c allude to a similar issue
> 
>     * Prior to inserting SOFT_RESERVED resources we want to check
>     * for an intersection with potential CXL resources.
> 
> This is similarly inverted - e820 doesn't know anthing about CXL
> and it shouldn't have to be made aware of CXL. Mucking with
> e820 is *begging* to be bitten elsewhere in parts of the system
> that depend on it to be a relatively stable source of truth.
> 
> 
> This tells me that this patch is trying to solve the wrong problem.
> 
> 
> Your changelog alludes to supporting hotplug replace
> 
> """
>   The current approach of leaving the SOFT RESERVE resource as is can
>   cause failure during hotplug replace of CXL devices because the 
>   resource is not available for reuse after teardown of the CXL device.
> """
> 
> It sounds like we should be making the SR resource available for
> re-use through proper teardown and cleanup of the resource tree,
> rather than trying to change fundamental components like e820.
> 
> If the driver was capable of using the SOFT_RESERVED region on
> initial setup, it should be capable of re-using that region.
> 
> 
> Is the issue here that the hotplug-replaced component has a
> different capacity? It it being assigned a new region entirely?
> Is it exactly the same, but the resource isn't being cleaned up?
> 
> Can you provide more specifics about the exact hotplug interaction
> that is happening? That might help understand the issue a bit better.
> 
> 
> Much of this sounds like we need additional better tear-down
> management and possibly additional cxl/acpi features to handle 
> hotplug of these devices - rather than changing resource.c.

Hi Gregory,

Never too late, and it serves to revisit and remember to carry
along some of the why behind this patch as we adopt it to do more.
Admittedly, the more, has also become the more important here!

I'm not sure what Nathan first saw that led him to this issue.
For me it was - 

BIOS labels a resource Soft Reserved and programs a region using
that range. Later, the existing cxl path to destroy that region
does not free up that Soft Reserved range. Users cannot create
another region in it's place. Resource lost. We considered simply
removing soft reserved resources on region teardown, and you can
probably find a patches on lore doing just that.

But - the problem grew. Sometimes BIOS creates an SR that is not
aligned with the region they go on to program. Stranded resources.
That's where the trim and give to DAX path originated.

But - the problem grew. Sometimes the CXL driver fails to enumerate
that BIOS defined region. More stranded resources. Let's find those
too and give them to DAX. This is something we are seeing in the
wild now and why Dan raised its priority.

Dan is also suggesting that at that last event - failure to enumerate
a BIOS defined region, we tear down the entire ACPI0017 toplogy
and give everything to DAX.

What Dan called, "the minimum requirement": all Soft Reserved ranges
end up as dax-devices sounds like the right guideline moving forward.

Hope that answers some of your why questions about the SR reuse.

It seems the worst problem, failure to enumerate a region, kind of
drives the solution now, and we should handle the soft reserveds
all the same. Even if Nathan doesn't implement it in one final
sweep through the Soft Reserveds, one way or another, those SR
ranges must end up as dax-device resources - either CXL managed
or BIOS Soft Reserved.

--Alison

> 
> 
> ~Gregory
> 
> > +	struct acpi_cedt_cfmws *cfmws;
> > +	struct srmem_arg *args = arg;
> > +	struct resource cfmws_res;
> > +	struct resource *res;
> > +
> > +	res = args->res;
> > +
> > +	cfmws = (struct acpi_cedt_cfmws *)hdr;
> > +	cfmws_res = DEFINE_RES_MEM(cfmws->base_hpa,
> > +				   cfmws->base_hpa + cfmws->window_size);
> > +
> > +	if (resource_overlaps(&cfmws_res, res)) {
> > +		args->overlaps += 1;
> > +		return 1;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static bool resource_overlaps_cfmws(struct resource *res)
> > +{
> > +	struct srmem_arg arg = {
> > +		.res = res,
> > +		.overlaps = 0
> > +	};
> > +
> > +	acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, srmem_parse_cfmws, &arg);
> > +
> > +	if (arg.overlaps)
> > +		return true;
> > +
> > +	return false;
> > +}
> > +
> > +int insert_soft_reserve_resource(struct resource *res)
> > +{
> > +	if (resource_overlaps_cfmws(res)) {
> > +		pr_info("Reserving Soft Reserve %pr\n", res);
> > +		return insert_resource(&srmem_resource, res);
> > +	}
> > +
> > +	return insert_resource(&iomem_resource, res);
> > +}
> > +EXPORT_SYMBOL(insert_soft_reserve_resource);
> > +
> >  static void __init
> >  __reserve_region_with_split(struct resource *root, resource_size_t start,
> >  			    resource_size_t end, const char *name)
> > -- 
> > 2.43.0
> >
Fontenot, Nathan Dec. 13, 2024, 4:33 p.m. UTC | #16
On 12/12/2024 7:01 PM, Alison Schofield wrote:
> On Thu, Dec 12, 2024 at 05:42:08PM -0500, Gregory Price wrote:
>> On Mon, Dec 02, 2024 at 09:55:42AM -0600, Nathan Fontenot wrote:
>> ... snip ...
>>> diff --git a/kernel/resource.c b/kernel/resource.c
>>> index a83040fde236..8fc4121a1887 100644
>>> --- a/kernel/resource.c
>>> +++ b/kernel/resource.c
>> ... snip ...
>>> +static int srmem_parse_cfmws(union acpi_subtable_headers *hdr,
>>> +			     void *arg, const unsigned long unused)
>>> +{
>>
>> Chiming in a little late to the party here
>>
>> I don't think encoding CXL-specific terminology and functionality
>> directly into kernel/resource.c is wise by any measure.
>>
>> The abstraction here is completely inverted, and this is probably
>> in line with Dan's comments.
>>
>> The comments in e820.c allude to a similar issue
>>
>>     * Prior to inserting SOFT_RESERVED resources we want to check
>>     * for an intersection with potential CXL resources.
>>
>> This is similarly inverted - e820 doesn't know anthing about CXL
>> and it shouldn't have to be made aware of CXL. Mucking with
>> e820 is *begging* to be bitten elsewhere in parts of the system
>> that depend on it to be a relatively stable source of truth.
>>
>>
>> This tells me that this patch is trying to solve the wrong problem.
>>
>>
>> Your changelog alludes to supporting hotplug replace
>>
>> """
>>   The current approach of leaving the SOFT RESERVE resource as is can
>>   cause failure during hotplug replace of CXL devices because the 
>>   resource is not available for reuse after teardown of the CXL device.
>> """
>>
>> It sounds like we should be making the SR resource available for
>> re-use through proper teardown and cleanup of the resource tree,
>> rather than trying to change fundamental components like e820.
>>
>> If the driver was capable of using the SOFT_RESERVED region on
>> initial setup, it should be capable of re-using that region.
>>
>>
>> Is the issue here that the hotplug-replaced component has a
>> different capacity? It it being assigned a new region entirely?
>> Is it exactly the same, but the resource isn't being cleaned up?
>>
>> Can you provide more specifics about the exact hotplug interaction
>> that is happening? That might help understand the issue a bit better.
>>
>>
>> Much of this sounds like we need additional better tear-down
>> management and possibly additional cxl/acpi features to handle 
>> hotplug of these devices - rather than changing resource.c.
> 
> Hi Gregory,
> 
> Never too late, and it serves to revisit and remember to carry
> along some of the why behind this patch as we adopt it to do more.
> Admittedly, the more, has also become the more important here!
> 
> I'm not sure what Nathan first saw that led him to this issue.
> For me it was - 
> 
> BIOS labels a resource Soft Reserved and programs a region using
> that range. Later, the existing cxl path to destroy that region
> does not free up that Soft Reserved range. Users cannot create
> another region in it's place. Resource lost. We considered simply
> removing soft reserved resources on region teardown, and you can
> probably find a patches on lore doing just that.

Yes, this is the same problem I was seeing and looking to solve.

> 
> But - the problem grew. Sometimes BIOS creates an SR that is not
> aligned with the region they go on to program. Stranded resources.
> That's where the trim and give to DAX path originated.
> 
> But - the problem grew. Sometimes the CXL driver fails to enumerate
> that BIOS defined region. More stranded resources. Let's find those
> too and give them to DAX. This is something we are seeing in the
> wild now and why Dan raised its priority.
> 
> Dan is also suggesting that at that last event - failure to enumerate
> a BIOS defined region, we tear down the entire ACPI0017 toplogy
> and give everything to DAX.
> 
> What Dan called, "the minimum requirement": all Soft Reserved ranges
> end up as dax-devices sounds like the right guideline moving forward.
> 
> Hope that answers some of your why questions about the SR reuse.
>

Thanks for the summary on where the patch started and where we're
currently at. I think it will help everyone.
 
> It seems the worst problem, failure to enumerate a region, kind of
> drives the solution now, and we should handle the soft reserveds
> all the same. Even if Nathan doesn't implement it in one final
> sweep through the Soft Reserveds, one way or another, those SR
> ranges must end up as dax-device resources - either CXL managed
> or BIOS Soft Reserved.
> 

This leads us to how to design the handling of soft reserves from
a larger system wide perspective, not just the CXL specific focus
I have come at this with in my patches.

If we start with setting aside soft reserved resources on a separate
list from iomem resources at boot, we could then let drivers use that
to remove pieces that the driver uses and release unused pieces
to the iomem tree. One thing to consider when setting aside soft reserves
is when to do it. If CXL isn't configured we don't want to set these
aside, Dan mentioned a possible config option to control this.

For CXL, I think the approach Dan is asking for is to wait for CXL
probe to complete and then walk through all created regions and remove
the intersecting pieces from the soft reserves. Any soft reserves
remaining could then be released and used as dax devices. Dan, please
correct me if I got that wrong.

-Nathan

> --Alison
> 
>>
>>
>> ~Gregory
>>
>>> +	struct acpi_cedt_cfmws *cfmws;
>>> +	struct srmem_arg *args = arg;
>>> +	struct resource cfmws_res;
>>> +	struct resource *res;
>>> +
>>> +	res = args->res;
>>> +
>>> +	cfmws = (struct acpi_cedt_cfmws *)hdr;
>>> +	cfmws_res = DEFINE_RES_MEM(cfmws->base_hpa,
>>> +				   cfmws->base_hpa + cfmws->window_size);
>>> +
>>> +	if (resource_overlaps(&cfmws_res, res)) {
>>> +		args->overlaps += 1;
>>> +		return 1;
>>> +	}
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static bool resource_overlaps_cfmws(struct resource *res)
>>> +{
>>> +	struct srmem_arg arg = {
>>> +		.res = res,
>>> +		.overlaps = 0
>>> +	};
>>> +
>>> +	acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, srmem_parse_cfmws, &arg);
>>> +
>>> +	if (arg.overlaps)
>>> +		return true;
>>> +
>>> +	return false;
>>> +}
>>> +
>>> +int insert_soft_reserve_resource(struct resource *res)
>>> +{
>>> +	if (resource_overlaps_cfmws(res)) {
>>> +		pr_info("Reserving Soft Reserve %pr\n", res);
>>> +		return insert_resource(&srmem_resource, res);
>>> +	}
>>> +
>>> +	return insert_resource(&iomem_resource, res);
>>> +}
>>> +EXPORT_SYMBOL(insert_soft_reserve_resource);
>>> +
>>>  static void __init
>>>  __reserve_region_with_split(struct resource *root, resource_size_t start,
>>>  			    resource_size_t end, const char *name)
>>> -- 
>>> 2.43.0
>>>
Gregory Price Dec. 26, 2024, 7:25 p.m. UTC | #17
On Thu, Dec 12, 2024 at 05:01:53PM -0800, Alison Schofield wrote:
> BIOS labels a resource Soft Reserved and programs a region using
> that range. Later, the existing cxl path to destroy that region
> does not free up that Soft Reserved range. Users cannot create
> another region in it's place. Resource lost. We considered simply
> removing soft reserved resources on region teardown, and you can
> probably find a patches on lore doing just that.
> 
> But - the problem grew. Sometimes BIOS creates an SR that is not
> aligned with the region they go on to program. Stranded resources.
> That's where the trim and give to DAX path originated.
> 
> But - the problem grew. Sometimes the CXL driver fails to enumerate
> that BIOS defined region. More stranded resources. Let's find those
> too and give them to DAX. This is something we are seeing in the
> wild now and why Dan raised its priority.
> 

Hm, this makes me concerned for what happens on "full hotplug" (literal
physical removal/addition) of CXL devices - kind of like we've seen
proposed with E3.S form factor devices from a variety of vendors.

Like what happens in the following scenario (rhetorical question, I want
to test this with QEMU - but i'm on a plane right now and want to get
the experiment process down).

Boot: No CXL device is present

Post-boot: CXL device is physically hot-plugged
 - there won't be a resource registered, so I would presume the ACPI
   / EFI / CXL drivers would register one.

Event 1: CXL device is shutdown and removed
   - Is the resource deleted?  I would presume yes.
   - Is this true if the CXL device *was* present at boot time?

     If i'm following correctly ^ this is the present scenario? 

Let's assume the device was present at boot, and the resource is not
deleted.  Now we have a "stale resource"?

Event 2A: A new CXL device is added
   - Possibility 1: Same capacity - resource is reused?
   - Possibility 2: Lower capacity - resource is chopped up?
   - Possibility 3: Higher capacity - resource is... lost forever?
                    Fails to map? ???

Event 2B: A new CXL device is added on a different PCI dev id, then
          Event 2A occurs.
   - Is the "stale resource" reused here, or is a new one created?

I hadn't really considered the impact of hotplug on the iomem resource
blocks (soft) reserved at boot, but this is concerning.

I remember ~1.5 years ago I was prototyping with hotplug behavior in
QEMU and saw that it was possible to do runtime ACPI/PCI add/remove of
CXL devices - this worked.  But I didn't look at the effects on iomem
resources - now i'm wondering what happens if I try to hot-unplug a CXL
device that was present at boot.

This won't affect me for the immediate future, but if we're mucking
around in this space, might as well ask the question.  I presume we'll
find even worse corner cases here :D :| :[ :<

I do know servers with front-facing E3.S CXL devices intended for
hot-replace exist and are a real use-case. I have no idea how that
is supposed to work in the presence of stale iomem resources.

> Dan is also suggesting that at that last event - failure to enumerate
> a BIOS defined region, we tear down the entire ACPI0017 toplogy
> and give everything to DAX.
> 
> What Dan called, "the minimum requirement": all Soft Reserved ranges
> end up as dax-devices sounds like the right guideline moving forward.
>

I guess the devil's in the details here.  I sense an implication that
it's possible for two distinct pieces of SR-providing hardware (HBM and
CXL) to end up concatenated into a single SR range? That would obviously
necessitate chopping up an SR.  So this all makes sense.

But I don't disagree with the need for this, just concerned that we have
CXL-specific logic landing in mm/ and e820 code.

~Gregory
diff mbox series

Patch

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 4893d30ce438..cab82e9324a5 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1210,14 +1210,23 @@  static unsigned long __init ram_alignment(resource_size_t pos)
 
 void __init e820__reserve_resources_late(void)
 {
-	int i;
 	struct resource *res;
+	int i;
 
+	/*
+	 * Prior to inserting SOFT_RESERVED resources we want to check for an
+	 * intersection with potential CXL resources. Any SOFT_RESERVED resources
+	 * that do intersect a potential CXL resource are set aside so they
+	 * can be trimmed to accommodate CXL resource intersections and added to
+	 * the iomem resource tree after the CXL drivers have completed their
+	 * device probe.
+	 */
 	res = e820_res;
-	for (i = 0; i < e820_table->nr_entries; i++) {
-		if (!res->parent && res->end)
+	for (i = 0; i < e820_table->nr_entries; i++, res++) {
+		if (res->desc == IORES_DESC_SOFT_RESERVED)
+			insert_soft_reserve_resource(res);
+		else if (!res->parent && res->end)
 			insert_resource_expand_to_fit(&iomem_resource, res);
-		res++;
 	}
 
 	/*
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index 21ad5f242875..c458a6313b31 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -3226,6 +3226,12 @@  static int match_region_by_range(struct device *dev, void *data)
 	return rc;
 }
 
+static int insert_region_resource(struct resource *parent, struct resource *res)
+{
+	trim_soft_reserve_resources(res);
+	return insert_resource(parent, res);
+}
+
 /* Establish an empty region covering the given HPA range */
 static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
 					   struct cxl_endpoint_decoder *cxled)
@@ -3272,7 +3278,7 @@  static struct cxl_region *construct_region(struct cxl_root_decoder *cxlrd,
 
 	*res = DEFINE_RES_MEM_NAMED(hpa->start, range_len(hpa),
 				    dev_name(&cxlr->dev));
-	rc = insert_resource(cxlrd->res, res);
+	rc = insert_region_resource(cxlrd->res, res);
 	if (rc) {
 		/*
 		 * Platform-firmware may not have split resources like "System
diff --git a/drivers/cxl/port.c b/drivers/cxl/port.c
index d7d5d982ce69..4461f2a80d72 100644
--- a/drivers/cxl/port.c
+++ b/drivers/cxl/port.c
@@ -89,6 +89,20 @@  static int cxl_switch_port_probe(struct cxl_port *port)
 	return -ENXIO;
 }
 
+static void cxl_sr_update(struct work_struct *w)
+{
+	merge_soft_reserve_resources();
+}
+
+DECLARE_DELAYED_WORK(cxl_sr_work, cxl_sr_update);
+
+static void schedule_soft_reserve_update(void)
+{
+	int timeout = 5 * HZ;
+
+	mod_delayed_work(system_wq, &cxl_sr_work, timeout);
+}
+
 static int cxl_endpoint_port_probe(struct cxl_port *port)
 {
 	struct cxl_endpoint_dvsec_info info = { .port = port };
@@ -140,6 +154,7 @@  static int cxl_endpoint_port_probe(struct cxl_port *port)
 	 */
 	device_for_each_child(&port->dev, root, discover_region);
 
+	schedule_soft_reserve_update();
 	return 0;
 }
 
diff --git a/drivers/dax/hmem/device.c b/drivers/dax/hmem/device.c
index f9e1a76a04a9..c45791ad4858 100644
--- a/drivers/dax/hmem/device.c
+++ b/drivers/dax/hmem/device.c
@@ -4,6 +4,7 @@ 
 #include <linux/module.h>
 #include <linux/dax.h>
 #include <linux/mm.h>
+#include "hmem.h"
 
 static bool nohmem;
 module_param_named(disable, nohmem, bool, 0444);
@@ -17,6 +18,9 @@  static struct resource hmem_active = {
 	.flags = IORESOURCE_MEM,
 };
 
+struct platform_device *hmem_pdev;
+EXPORT_SYMBOL_GPL(hmem_pdev);
+
 int walk_hmem_resources(struct device *host, walk_hmem_fn fn)
 {
 	struct resource *res;
@@ -35,7 +39,6 @@  EXPORT_SYMBOL_GPL(walk_hmem_resources);
 
 static void __hmem_register_resource(int target_nid, struct resource *res)
 {
-	struct platform_device *pdev;
 	struct resource *new;
 	int rc;
 
@@ -51,15 +54,15 @@  static void __hmem_register_resource(int target_nid, struct resource *res)
 	if (platform_initialized)
 		return;
 
-	pdev = platform_device_alloc("hmem_platform", 0);
-	if (!pdev) {
+	hmem_pdev = platform_device_alloc("hmem_platform", 0);
+	if (!hmem_pdev) {
 		pr_err_once("failed to register device-dax hmem_platform device\n");
 		return;
 	}
 
-	rc = platform_device_add(pdev);
+	rc = platform_device_add(hmem_pdev);
 	if (rc)
-		platform_device_put(pdev);
+		platform_device_put(hmem_pdev);
 	else
 		platform_initialized = true;
 }
diff --git a/drivers/dax/hmem/hmem.c b/drivers/dax/hmem/hmem.c
index 5e7c53f18491..d626b60a9716 100644
--- a/drivers/dax/hmem/hmem.c
+++ b/drivers/dax/hmem/hmem.c
@@ -5,6 +5,7 @@ 
 #include <linux/pfn_t.h>
 #include <linux/dax.h>
 #include "../bus.h"
+#include "hmem.h"
 
 static bool region_idle;
 module_param_named(region_idle, region_idle, bool, 0644);
@@ -123,8 +124,22 @@  static int hmem_register_device(struct device *host, int target_nid,
 	return rc;
 }
 
+static int dax_hmem_cb(struct notifier_block *nb, unsigned long action,
+		       void *arg)
+{
+	struct resource *res = arg;
+
+	return hmem_register_device(&hmem_pdev->dev,
+				    phys_to_target_node(res->start), res);
+}
+
+static struct notifier_block hmem_nb = {
+	.notifier_call = dax_hmem_cb
+};
+
 static int dax_hmem_platform_probe(struct platform_device *pdev)
 {
+	register_soft_reserve_notifier(&hmem_nb);
 	return walk_hmem_resources(&pdev->dev, hmem_register_device);
 }
 
diff --git a/drivers/dax/hmem/hmem.h b/drivers/dax/hmem/hmem.h
new file mode 100644
index 000000000000..95583b59cef7
--- /dev/null
+++ b/drivers/dax/hmem/hmem.h
@@ -0,0 +1,11 @@ 
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _HMEM_H
+#define _HMEM_H
+
+typedef int (*walk_hmem_fn)(struct device *dev, int target_nid,
+			    const struct resource *res);
+int walk_hmem_resources(struct device *dev, walk_hmem_fn fn);
+
+extern struct platform_device *hmem_pdev;
+
+#endif
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 9d3e3327af4c..119b4e27a592 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -282,8 +282,4 @@  static inline void hmem_register_resource(int target_nid, struct resource *r)
 {
 }
 #endif
-
-typedef int (*walk_hmem_fn)(struct device *dev, int target_nid,
-			    const struct resource *res);
-int walk_hmem_resources(struct device *dev, walk_hmem_fn fn);
 #endif
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 6e9fb667a1c5..487371a46392 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -14,6 +14,7 @@ 
 #include <linux/compiler.h>
 #include <linux/minmax.h>
 #include <linux/types.h>
+#include <linux/notifier.h>
 /*
  * Resources are tree-like, allowing
  * nesting etc..
@@ -249,6 +250,11 @@  struct resource *lookup_resource(struct resource *root, resource_size_t start);
 int adjust_resource(struct resource *res, resource_size_t start,
 		    resource_size_t size);
 resource_size_t resource_alignment(struct resource *res);
+extern void trim_soft_reserve_resources(const struct resource *res);
+extern void merge_soft_reserve_resources(void);
+extern int insert_soft_reserve_resource(struct resource *res);
+extern int register_soft_reserve_notifier(struct notifier_block *nb);
+extern int unregister_soft_reserve_notifier(struct notifier_block *nb);
 static inline resource_size_t resource_size(const struct resource *res)
 {
 	return res->end - res->start + 1;
diff --git a/kernel/resource.c b/kernel/resource.c
index a83040fde236..8fc4121a1887 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -30,7 +30,7 @@ 
 #include <linux/string.h>
 #include <linux/vmalloc.h>
 #include <asm/io.h>
-
+#include <linux/acpi.h>
 
 struct resource ioport_resource = {
 	.name	= "PCI IO",
@@ -48,7 +48,15 @@  struct resource iomem_resource = {
 };
 EXPORT_SYMBOL(iomem_resource);
 
+struct resource srmem_resource = {
+	.name	= "Soft Reserved mem",
+	.start	= 0,
+	.end	= -1,
+	.flags	= IORESOURCE_MEM,
+};
+
 static DEFINE_RWLOCK(resource_lock);
+static DEFINE_RWLOCK(srmem_resource_lock);
 
 static struct resource *next_resource(struct resource *p, bool skip_children)
 {
@@ -1034,6 +1042,151 @@  int adjust_resource(struct resource *res, resource_size_t start,
 }
 EXPORT_SYMBOL(adjust_resource);
 
+static BLOCKING_NOTIFIER_HEAD(soft_reserve_chain);
+
+/*
+ * Register for notification when a staged Soft Reserved range is
+ * published into iomem_resource (consumed by the dax/hmem driver).
+ */
+int register_soft_reserve_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_register(&soft_reserve_chain, nb);
+}
+EXPORT_SYMBOL(register_soft_reserve_notifier);
+
+int unregister_soft_reserve_notifier(struct notifier_block *nb)
+{
+	return blocking_notifier_chain_unregister(&soft_reserve_chain, nb);
+}
+EXPORT_SYMBOL(unregister_soft_reserve_notifier);
+
+/*
+ * Notify listeners of a newly published Soft Reserved resource.
+ * Blocking chain: callers must be in sleepable context.
+ */
+static int soft_reserve_notify(unsigned long val, void *v)
+{
+	struct resource *res = v;
+
+	pr_info("Adding Soft Reserve resource %pr\n", res);
+	return blocking_notifier_call_chain(&soft_reserve_chain, val, v);
+}
+
+/*
+ * Remove the span of @res from the Soft Reserved resource @sr_res.
+ * Caller holds srmem_resource_lock for writing (a spinning rwlock),
+ * so nothing here may sleep.
+ *
+ * Four cases: exact match (drop @sr_res entirely), trim the front,
+ * trim the back, or split @sr_res in two around @res.
+ */
+static void trim_soft_reserve(struct resource *sr_res,
+			      const struct resource *res)
+{
+	struct resource *new_res;
+
+	if (sr_res->start == res->start && sr_res->end == res->end) {
+		release_resource(sr_res);
+		free_resource(sr_res);
+	} else if (sr_res->start == res->start) {
+		WARN_ON(adjust_resource(sr_res, res->end + 1,
+					sr_res->end - res->end));
+	} else if (sr_res->end == res->end) {
+		WARN_ON(adjust_resource(sr_res, sr_res->start,
+					res->start - sr_res->start));
+	} else {
+		/*
+		 * Capture the original end now: adjust_resource() below
+		 * rewrites sr_res->end, and the upper remainder's size
+		 * must be computed from the pre-adjustment value.
+		 */
+		resource_size_t old_end = sr_res->end;
+
+		/*
+		 * Allocate the upper remainder first so an allocation
+		 * failure leaves @sr_res untouched.  GFP_ATOMIC because
+		 * srmem_resource_lock is held.
+		 */
+		new_res = alloc_resource(GFP_ATOMIC);
+		if (!new_res)
+			return;
+
+		/*
+		 * Shrink the existing resource to cover the range below
+		 * the span being trimmed out.
+		 */
+		WARN_ON(adjust_resource(sr_res, sr_res->start,
+					res->start - sr_res->start));
+
+		/*
+		 * Cover the range above the trimmed span with the new
+		 * resource.
+		 */
+		*new_res = DEFINE_RES_NAMED(res->end + 1, old_end - res->end,
+					    "Soft Reserved", sr_res->flags);
+		new_res->desc = IORES_DESC_SOFT_RESERVED;
+		WARN_ON(insert_resource(&srmem_resource, new_res));
+	}
+}
+
+/*
+ * Trim any staged Soft Reserved range that fully contains @res, so the
+ * span is free for reuse after a CXL region teardown.
+ *
+ * NOTE(review): only full containment (resource_contains) is handled; a
+ * partial overlap leaves the staged entry untouched — confirm that a CXL
+ * region's HPA range can never straddle a staged Soft Reserved entry.
+ */
+void trim_soft_reserve_resources(const struct resource *res)
+{
+	struct resource *sr_res;
+
+	write_lock(&srmem_resource_lock);
+	for (sr_res = srmem_resource.child; sr_res; sr_res = sr_res->sibling) {
+		/* Siblings are disjoint, so at most one entry can contain @res. */
+		if (resource_contains(sr_res, res)) {
+			trim_soft_reserve(sr_res, res);
+			break;
+		}
+	}
+	write_unlock(&srmem_resource_lock);
+}
+EXPORT_SYMBOL(trim_soft_reserve_resources);
+
+/*
+ * Publish all remaining staged Soft Reserved resources into
+ * iomem_resource and notify listeners (the dax/hmem driver) of each.
+ *
+ * The notifier chain is blocking and may sleep, so it must not be
+ * called while holding srmem_resource_lock (a spinning rwlock): detach
+ * one child at a time under the lock, then insert and notify after
+ * dropping it.
+ */
+void merge_soft_reserve_resources(void)
+{
+	struct resource *sr_res;
+
+	for (;;) {
+		write_lock(&srmem_resource_lock);
+		sr_res = srmem_resource.child;
+		if (sr_res)
+			release_resource(sr_res);
+		write_unlock(&srmem_resource_lock);
+
+		if (!sr_res)
+			break;
+
+		if (insert_resource(&iomem_resource, sr_res))
+			pr_info("Could not add Soft Reserve %pr\n", sr_res);
+		else
+			soft_reserve_notify(0, sr_res);
+	}
+}
+EXPORT_SYMBOL(merge_soft_reserve_resources);
+
+struct srmem_arg {
+	struct resource *res;	/* range being tested against CFMWS windows */
+	int overlaps;		/* count of overlapping windows found */
+};
+
+/*
+ * CEDT CFMWS subtable callback: record whether @arg->res overlaps this
+ * CXL fixed memory window.  Returns nonzero on the first overlap to
+ * stop the table walk early.
+ */
+static int srmem_parse_cfmws(union acpi_subtable_headers *hdr,
+			     void *arg, const unsigned long unused)
+{
+	struct acpi_cedt_cfmws *cfmws;
+	struct srmem_arg *args = arg;
+	struct resource cfmws_res;
+	struct resource *res;
+
+	res = args->res;
+
+	cfmws = (struct acpi_cedt_cfmws *)hdr;
+	/*
+	 * DEFINE_RES_MEM() takes (start, size); passing
+	 * base_hpa + window_size as the size doubled the window extent.
+	 */
+	cfmws_res = DEFINE_RES_MEM(cfmws->base_hpa, cfmws->window_size);
+
+	if (resource_overlaps(&cfmws_res, res)) {
+		args->overlaps += 1;
+		return 1;
+	}
+
+	return 0;
+}
+
+/* True when @res overlaps at least one CEDT CFMWS window. */
+static bool resource_overlaps_cfmws(struct resource *res)
+{
+	struct srmem_arg arg = {
+		.res = res,
+		.overlaps = 0,
+	};
+
+	acpi_table_parse_cedt(ACPI_CEDT_TYPE_CFMWS, srmem_parse_cfmws, &arg);
+
+	return arg.overlaps != 0;
+}
+
+/*
+ * Route a Soft Reserved resource at boot: ranges intersecting a CFMWS
+ * window are staged in srmem_resource for later trimming; everything
+ * else goes straight into iomem_resource.
+ */
+int insert_soft_reserve_resource(struct resource *res)
+{
+	struct resource *root = &iomem_resource;
+
+	if (resource_overlaps_cfmws(res)) {
+		pr_info("Reserving Soft Reserve %pr\n", res);
+		root = &srmem_resource;
+	}
+
+	return insert_resource(root, res);
+}
+EXPORT_SYMBOL(insert_soft_reserve_resource);
+
 static void __init
 __reserve_region_with_split(struct resource *root, resource_size_t start,
 			    resource_size_t end, const char *name)