
[RFC] cxl/region: set numa node for target memdevs when a region is committed

Message ID 20250314164629.6937-1-nifan.cxl@gmail.com
State New
Headers show
Series [RFC] cxl/region: set numa node for target memdevs when a region is committed

Commit Message

Fan Ni March 14, 2025, 4:40 p.m. UTC
From: Fan Ni <fan.ni@samsung.com>

There is a sysfs attribute named "numa_node" for each CXL memory device.
However, it is never set, so -1 is returned whenever it is read.

With this change, the numa_node of each target memdev is set based on the
start address of the hpa_range of the endpoint decoder it is associated
with when a CXL region is created, and it is reset when the region
decoders are reset.

Open question: do we need to set the numa_node when the memdev is
probed instead of waiting until a region is created?

Signed-off-by: Fan Ni <fan.ni@samsung.com>
---
 drivers/cxl/core/region.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
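
For reference, this is what reading the attribute looks like before the change
(the memdev name and sysfs path reflect a typical topology and are illustrative
only):

    $ cat /sys/bus/cxl/devices/mem0/numa_node
    -1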

Comments

Dave Jiang March 18, 2025, 9 p.m. UTC | #1
On 3/14/25 9:40 AM, nifan.cxl@gmail.com wrote:
> From: Fan Ni <fan.ni@samsung.com>
> 
> There is a sysfs attribute named "numa_node" for cxl memory device.
> however, it is never set so -1 is returned whenever it is read.
> 
> With this change, the numa_node of each target memdev is set based on the
> start address of the hpa_range of the endpoint decoder it associated when a
> cxl region is created; and it is reset when the region decoders are
> reset.
> 
> Open qeustion: do we need to set the numa_node when the memdev is
> probed instead of waiting until a region is created?

Typically, the numa node for a PCI device should be dev_to_node(), i.e. the node where the device resides. So when the device is probed, it should be set from that. See the documentation [1]. A region should have its own NUMA node based on phys_to_target_node() of its starting address.

[1]: https://elixir.bootlin.com/linux/v6.14-rc6/source/Documentation/ABI/testing/sysfs-bus-cxl#L85
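
As a rough sketch of that split (not part of the posted patch; the helper names
and placement are hypothetical), the memdev would inherit the node of its parent
PCI device at probe time, while the region would derive its node from its
starting HPA:

static void cxl_memdev_set_node(struct cxl_memdev *cxlmd)
{
	/* device (MMIO) affinity follows the parent PCI device */
	set_dev_node(&cxlmd->dev, dev_to_node(cxlmd->dev.parent));
}

static void cxl_region_set_node(struct cxl_region *cxlr)
{
	struct cxl_region_params *p = &cxlr->params;

	/* region memory affinity comes from its starting HPA */
	set_dev_node(&cxlr->dev, phys_to_target_node(p->res->start));
}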

DJ

> 
> Signed-off-by: Fan Ni <fan.ni@samsung.com>
> ---
>  drivers/cxl/core/region.c | 18 ++++++++++++++++++
>  1 file changed, 18 insertions(+)
> 
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index e8d11a988fd9..935ee0b1dd26 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -242,6 +242,13 @@ static int cxl_region_invalidate_memregion(struct cxl_region *cxlr)
>  	return 0;
>  }
>  
> +static void cxl_mem_reset_numa_node(struct cxl_endpoint_decoder *cxled)
> +{
> +	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
> +
> +	cxlmd->dev.numa_node = NUMA_NO_NODE;
> +}
> +
>  static void cxl_region_decode_reset(struct cxl_region *cxlr, int count)
>  {
>  	struct cxl_region_params *p = &cxlr->params;
> @@ -264,6 +271,7 @@ static void cxl_region_decode_reset(struct cxl_region *cxlr, int count)
>  		if (cxlds->rcd)
>  			goto endpoint_reset;
>  
> +		cxl_mem_reset_numa_node(cxled);
>  		while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
>  			iter = to_cxl_port(iter->dev.parent);
>  
> @@ -304,6 +312,15 @@ static int commit_decoder(struct cxl_decoder *cxld)
>  	return 0;
>  }
>  
> +static void cxl_mem_set_numa_node(struct cxl_endpoint_decoder *cxled)
> +{
> +	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
> +	u64 addr = cxled->cxld.hpa_range.start;
> +
> +	cxlmd->dev.numa_node = phys_to_target_node(addr);
> +	dev_dbg(&cxlmd->dev, "set numa node: %d\n", phys_to_target_node(addr));
> +}
> +
>  static int cxl_region_decode_commit(struct cxl_region *cxlr)
>  {
>  	struct cxl_region_params *p = &cxlr->params;
> @@ -340,6 +357,7 @@ static int cxl_region_decode_commit(struct cxl_region *cxlr)
>  			cxled->cxld.reset(&cxled->cxld);
>  			goto err;
>  		}
> +		cxl_mem_set_numa_node(cxled);
>  	}
>  
>  	return 0;
Dan Williams March 18, 2025, 9:25 p.m. UTC | #2
Dave Jiang wrote:
> 
> 
> On 3/14/25 9:40 AM, nifan.cxl@gmail.com wrote:
> > From: Fan Ni <fan.ni@samsung.com>
> > 
> > There is a sysfs attribute named "numa_node" for cxl memory device.
> > however, it is never set so -1 is returned whenever it is read.
> > 
> > With this change, the numa_node of each target memdev is set based on the
> > start address of the hpa_range of the endpoint decoder it associated when a
> > cxl region is created; and it is reset when the region decoders are
> > reset.
> > 
> > Open qeustion: do we need to set the numa_node when the memdev is
> > probed instead of waiting until a region is created?
> 
> Typically, the numa node for a PCI device should be dev_to_node(),
> where the device resides. So when the device is probed, it should be
> set with that. See documentation [1]. Region should have its own NUMA
> node based on phys_to_target_node() of the starting address.  

Right, the memdev node is the affinity of device-MMIO to a CPU. The
HDM-memory that the device decodes may land in multiple proximity
domains and is subject to CDAT, CXL QoS, HMAT Generic Port, etc...

If your memdev node is "NUMA_NO_NODE" then that likely means the
affinity information for the PCI device is missing.

I would double check that first. See set_dev_node() in device_add().
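
One quick way to double check that from userspace (a sketch; the PCI address is
the cxl_pci function that shows up in the log below, and the sysfs paths assume
the standard layout):

    # node of the parent PCI function vs. the CXL memdev
    cat /sys/bus/pci/devices/0000:0d:00.0/numa_node
    cat /sys/bus/cxl/devices/mem0/numa_node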
Fan Ni March 18, 2025, 11:11 p.m. UTC | #3
On Tue, Mar 18, 2025 at 02:25:40PM -0700, Dan Williams wrote:
> Dave Jiang wrote:
> > 
> > 
> > On 3/14/25 9:40 AM, nifan.cxl@gmail.com wrote:
> > > From: Fan Ni <fan.ni@samsung.com>
> > > 
> > > There is a sysfs attribute named "numa_node" for cxl memory device.
> > > however, it is never set so -1 is returned whenever it is read.
> > > 
> > > With this change, the numa_node of each target memdev is set based on the
> > > start address of the hpa_range of the endpoint decoder it associated when a
> > > cxl region is created; and it is reset when the region decoders are
> > > reset.
> > > 
> > > Open qeustion: do we need to set the numa_node when the memdev is
> > > probed instead of waiting until a region is created?
> > 
> > Typically, the numa node for a PCI device should be dev_to_node(),
> > where the device resides. So when the device is probed, it should be
> > set with that. See documentation [1]. Region should have its own NUMA
> > node based on phys_to_target_node() of the starting address.  
> 
> Right, the memdev node is the affinity of device-MMIO to a CPU. The
> HDM-memory that the device decodes may land in multiple proximity
> domains and is subject to CDAT, CXL QoS, HMAT Generic Port, etc...
> 
> If your memdev node is "NUMA_NO_NODE" then that likely means the
> affinity information for the PCI device is missing.
> 
> I would double check that first. See set_dev_node() in device_add().

Thanks Dave and Dan for the explanation.
Then the issue must be in the QEMU setup.

I added some debug code as below
---------------------------------------------
fan:~/cxl/linux-fixes$ git diff
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 5a1f05198114..c86a9eb58e99 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -3594,6 +3594,10 @@ int device_add(struct device *dev)
        if (kobj)
                dev->kobj.parent = kobj;
 
+        dev_dbg(dev, "device: '%s': %s XX node %d\n", dev_name(dev), __func__, dev_to_node(dev));
+        if (parent) {
+                dev_dbg(parent, "parent device: '%s': %s XX node %d\n", dev_name(parent), __func__, dev_to_node(parent));
+        }
        /* use parent numa_node */
        if (parent && (dev_to_node(dev) == NUMA_NO_NODE))
                set_dev_node(dev, dev_to_node(parent));
---------------------------------------------

The output after loading the CXL-related drivers looks like below. All
numa_node values are -1 in the CXL topology.

Hi Jonathan,
   am I missing something in the QEMU setup?

qemu-system-x86_64 -s  -kernel bzImage -append "root=/dev/sda rw console=ttyS0,115200 ignore_loglevel nokaslr \
cxl_acpi.dyndbg=+fplm cxl_pci.dyndbg=+fplm cxl_core.dyndbg=+fplm cxl_mem.dyndbg=+fplm cxl_pmem.dyndbg=+fplm \
cxl_port.dyndbg=+fplm cxl_region.dyndbg=+fplm cxl_test.dyndbg=+fplm cxl_mock.dyndbg=+fplm \
cxl_mock_mem.dyndbg=+fplm dax.dyndbg=+fplm dax_cxl.dyndbg=+fplm device_dax.dyndbg=+fplm" \
-smp 8 -accel kvm -serial mon:stdio  -nographic  -qmp tcp:localhost:4445,server,wait=off \
-netdev user,id=network0,hostfwd=tcp::2024-:22 -device e1000,netdev=network0  -monitor telnet:127.0.0.1:12346,server,nowait \
-drive file=/home/fan/cxl/images/qemu-image.img,index=0,media=disk,format=raw -machine q35,cxl=on -cpu qemu64,mce=on \
-m 8G,maxmem=64G,slots=8  -virtfs local,path=/opt/lib/modules,mount_tag=modshare,security_model=mapped  \
-virtfs local,path=/home/fan,mount_tag=homeshare,security_model=mapped -object \
memory-backend-file,id=cxl-mem1,share=on,mem-path=/tmp/host//cxltest.raw,size=512M \
-object memory-backend-file,id=cxl-lsa1,share=on,mem-path=/tmp/host//lsa.raw,size=1M \
-device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1,hdm_for_passthrough=true  \
-device cxl-rp,port=0,bus=cxl.1,id=root_port13,chassis=0,slot=2   \
-device cxl-type3,bus=root_port13,memdev=cxl-mem1,lsa=cxl-lsa1,id=cxl-pmem0,sn=0xabcd    \
-M cxl-fmw.0.targets.0=cxl.1,cxl-fmw.0.size=4G,cxl-fmw.0.interleave-granularity=8k

---------------------------------------------
fan:~/cxl/linux-fixes$ cat .config | grep CONFIG_NUMA
# CONFIG_NUMA_BALANCING is not set
CONFIG_NUMA=y
CONFIG_NUMA_KEEP_MEMINFO=y
CONFIG_NUMA_MEMBLKS=y
# CONFIG_NUMA_EMU is not set
fan:~/cxl/linux-fixes$ 

---------------------------------------------
root@debian:~# echo 'file core.c +p' >> /sys/kernel/debug/dynamic_debug/control
root@debian:~# dmesg | grep XX
root@debian:~# dmesg | grep XX
[   44.939510] wakeup wakeup14: device: 'wakeup14': device_add XX node -1
[   44.940195] acpi ACPI0017:00: parent device: 'ACPI0017:00': device_add XX node -1
[   44.941402] cxl root0: device: 'root0': device_add XX node -1
[   44.942023] cxl_acpi ACPI0017:00: parent device: 'ACPI0017:00': device_add XX node -1
[   44.947546] cxl decoder0.0: device: 'decoder0.0': device_add XX node -1
[   44.948219] cxl root0: parent device: 'root0': device_add XX node -1
[   44.958637] cxl port1: device: 'port1': device_add XX node -1
[   44.959245] cxl root0: parent device: 'root0': device_add XX node -1
[   44.990326] cxl decoder1.0: device: 'decoder1.0': device_add XX node -1
[   44.991014] cxl_port port1: parent device: 'port1': device_add XX node -1
[   44.993947] cxl decoder1.1: device: 'decoder1.1': device_add XX node -1
[   44.994593] cxl_port port1: parent device: 'port1': device_add XX node -1
[   44.997521] cxl decoder1.2: device: 'decoder1.2': device_add XX node -1
[   44.998203] cxl_port port1: parent device: 'port1': device_add XX node -1
[   45.001142] cxl decoder1.3: device: 'decoder1.3': device_add XX node -1
[   45.001821] cxl_port port1: parent device: 'port1': device_add XX node -1
[   45.005465] cxl nvdimm-bridge0: device: 'nvdimm-bridge0': device_add XX node -1
[   45.006206] cxl root0: parent device: 'root0': device_add XX node -1
[   45.072975] cxl mem0: device: 'mem0': device_add XX node -1
[   45.073519] cxl_pci 0000:0d:00.0: parent device: '0000:0d:00.0': device_add XX node -1
[   45.074937] firmware mem0: device: 'mem0': device_add XX node -1
[   45.075525] cxl mem0: parent device: 'mem0': device_add XX node -1
[   45.095409] nd ndbus0: device: 'ndbus0': device_add XX node -1
[   45.096135] cxl_nvdimm_bridge nvdimm-bridge0: parent device: 'nvdimm-bridge0': device_add XX node -1
[   45.097476] nd ndctl0: device: 'ndctl0': device_add XX node -1
[   45.099208] nd_bus ndbus0: parent device: 'ndbus0': device_add XX node -1
[   45.101286] cxl pmem0: device: 'pmem0': device_add XX node -1
[   45.102633] cxl_mem mem0: parent device: 'mem0': device_add XX node -1
[   45.108757] nd nmem0: device: 'nmem0': device_add XX node -1
[   45.109317] nd_bus ndbus0: parent device: 'ndbus0': device_add XX node -1
[   45.119846] cxl endpoint2: device: 'endpoint2': device_add XX node -1
[   45.120474] cxl_port port1: parent device: 'port1': device_add XX node -1
[   45.149351] cxl decoder2.0: device: 'decoder2.0': device_add XX node -1
[   45.150029] cxl_port endpoint2: parent device: 'endpoint2': device_add XX node -1
[   45.153057] cxl decoder2.1: device: 'decoder2.1': device_add XX node -1
[   45.153700] cxl_port endpoint2: parent device: 'endpoint2': device_add XX node -1
[   45.156723] cxl decoder2.2: device: 'decoder2.2': device_add XX node -1
[   45.157384] cxl_port endpoint2: parent device: 'endpoint2': device_add XX node -1
[   45.160407] cxl decoder2.3: device: 'decoder2.3': device_add XX node -1
[   45.161073] cxl_port endpoint2: parent device: 'endpoint2': device_add XX node -1
root@debian:~#
Dan Williams March 19, 2025, 12:16 a.m. UTC | #4
Fan Ni wrote:
> On Tue, Mar 18, 2025 at 02:25:40PM -0700, Dan Williams wrote:
> > Dave Jiang wrote:
> > > 
> > > 
> > > On 3/14/25 9:40 AM, nifan.cxl@gmail.com wrote:
> > > > From: Fan Ni <fan.ni@samsung.com>
> > > > 
> > > > There is a sysfs attribute named "numa_node" for cxl memory device.
> > > > however, it is never set so -1 is returned whenever it is read.
> > > > 
> > > > With this change, the numa_node of each target memdev is set based on the
> > > > start address of the hpa_range of the endpoint decoder it associated when a
> > > > cxl region is created; and it is reset when the region decoders are
> > > > reset.
> > > > 
> > > > Open qeustion: do we need to set the numa_node when the memdev is
> > > > probed instead of waiting until a region is created?
> > > 
> > > Typically, the numa node for a PCI device should be dev_to_node(),
> > > where the device resides. So when the device is probed, it should be
> > > set with that. See documentation [1]. Region should have its own NUMA
> > > node based on phys_to_target_node() of the starting address.  
> > 
> > Right, the memdev node is the affinity of device-MMIO to a CPU. The
> > HDM-memory that the device decodes may land in multiple proximity
> > domains and is subject to CDAT, CXL QoS, HMAT Generic Port, etc...
> > 
> > If your memdev node is "NUMA_NO_NODE" then that likely means the
> > affinity information for the PCI device is missing.
> > 
> > I would double check that first. See set_dev_node() in device_add().
> 
> Thanks Dave and Dan for the explanation. 
> Then the issue must be from qemu setup.
> 
> I added some debug code as below
> ---------------------------------------------
> fan:~/cxl/linux-fixes$ git diff
> diff --git a/drivers/base/core.c b/drivers/base/core.c
> index 5a1f05198114..c86a9eb58e99 100644
> --- a/drivers/base/core.c
> +++ b/drivers/base/core.c
> @@ -3594,6 +3594,10 @@ int device_add(struct device *dev)
>         if (kobj)
>                 dev->kobj.parent = kobj;
>  
> +        dev_dbg(dev, "device: '%s': %s XX node %d\n", dev_name(dev), __func__, dev_to_node(dev));
> +        if (parent) {
> +                dev_dbg(parent, "parent device: '%s': %s XX node %d\n", dev_name(parent), __func__, dev_to_node(parent));
> +        }
>         /* use parent numa_node */
>         if (parent && (dev_to_node(dev) == NUMA_NO_NODE))
>                 set_dev_node(dev, dev_to_node(parent));
> ---------------------------------------------
> 
> The output after loading cxl related drivers looks like below. All
> numa_node is -1 in the cxl topology. 
> 
> Hi Jonathan,
>    do I miss something in the qemu setup ??

IIUC the typical expectation for communicating the affinity of PCI
devices is an ACPI _PXM property for the host bridge object in the
[DS]SDT. As far as I can see QEMU does not build _PXM information for
its host bridges.
Jonathan Cameron March 21, 2025, 12:22 p.m. UTC | #5
On Tue, 18 Mar 2025 17:16:11 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> Fan Ni wrote:
> > On Tue, Mar 18, 2025 at 02:25:40PM -0700, Dan Williams wrote:  
> > > Dave Jiang wrote:  
> > > > 
> > > > 
> > > > On 3/14/25 9:40 AM, nifan.cxl@gmail.com wrote:  
> > > > > From: Fan Ni <fan.ni@samsung.com>
> > > > > 
> > > > > There is a sysfs attribute named "numa_node" for cxl memory device.
> > > > > however, it is never set so -1 is returned whenever it is read.
> > > > > 
> > > > > With this change, the numa_node of each target memdev is set based on the
> > > > > start address of the hpa_range of the endpoint decoder it associated when a
> > > > > cxl region is created; and it is reset when the region decoders are
> > > > > reset.
> > > > > 
> > > > > Open qeustion: do we need to set the numa_node when the memdev is
> > > > > probed instead of waiting until a region is created?  
> > > > 
> > > > Typically, the numa node for a PCI device should be dev_to_node(),
> > > > where the device resides. So when the device is probed, it should be
> > > > set with that. See documentation [1]. Region should have its own NUMA
> > > > node based on phys_to_target_node() of the starting address.    
> > > 
> > > Right, the memdev node is the affinity of device-MMIO to a CPU. The
> > > HDM-memory that the device decodes may land in multiple proximity
> > > domains and is subject to CDAT, CXL QoS, HMAT Generic Port, etc...
> > > 
> > > If your memdev node is "NUMA_NO_NODE" then that likely means the
> > > affinity information for the PCI device is missing.
> > > 
> > > I would double check that first. See set_dev_node() in device_add().  
> > 
> > Thanks Dave and Dan for the explanation. 
> > Then the issue must be from qemu setup.
> > 
> > I added some debug code as below
> > ---------------------------------------------
> > fan:~/cxl/linux-fixes$ git diff
> > diff --git a/drivers/base/core.c b/drivers/base/core.c
> > index 5a1f05198114..c86a9eb58e99 100644
> > --- a/drivers/base/core.c
> > +++ b/drivers/base/core.c
> > @@ -3594,6 +3594,10 @@ int device_add(struct device *dev)
> >         if (kobj)
> >                 dev->kobj.parent = kobj;
> >  
> > +        dev_dbg(dev, "device: '%s': %s XX node %d\n", dev_name(dev), __func__, dev_to_node(dev));
> > +        if (parent) {
> > +                dev_dbg(parent, "parent device: '%s': %s XX node %d\n", dev_name(parent), __func__, dev_to_node(parent));
> > +        }
> >         /* use parent numa_node */
> >         if (parent && (dev_to_node(dev) == NUMA_NO_NODE))
> >                 set_dev_node(dev, dev_to_node(parent));
> > ---------------------------------------------
> > 
> > The output after loading cxl related drivers looks like below. All
> > numa_node is -1 in the cxl topology. 
> > 
> > Hi Jonathan,
> >    do I miss something in the qemu setup ??  
> 
> IIUC the typical expectation for communicating the affinity of PCI
> devices is an ACPI _PXM property for the host bridge object in the
> [DS]SDT. As far as I can see QEMU does not build _PXM information for
> its host bridges.
> 
First, a side note. _PXM on the device is in theory also an option, but
long ago the 'fix' for that was reverted due to some really broken old
AMD platforms that put devices in non-existent nodes. Hmm. I should
revisit that as I 'think' all the allocation with broken numa nodes
is long since fixed (it included an ACPI spec clarification, so it took a while!)
https://lore.kernel.org/linux-pci/20181211094737.71554-1-Jonathan.Cameron@huawei.com/

As for _PXM on host bridges, the gpex ACPI code does assign it for
PCI Expander Bridges if you pass in the node:
https://elixir.bootlin.com/qemu/v9.2.2/source/hw/pci-host/gpex-acpi.c#L178
(So we are good on ARM :)
https://elixir.bootlin.com/qemu/v9.2.2/source/hw/i386/acpi-build.c#L1533
does the same on x86.

Those go via some indirections to a callback here:
https://elixir.bootlin.com/qemu/v9.2.2/source/hw/pci-bridge/pci_expander_bridge.c#L80

So set numa_node=X for each of your PXB instances.
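
For example (a minimal sketch, not the exact command from this thread: it
assumes a single guest node and that all of the initial -m 8G is backed by
ram0), the relevant additions would look something like:

    # define one NUMA node and bind the CXL expander bridge to it
    -object memory-backend-ram,id=ram0,size=8G \
    -numa node,nodeid=0,cpus=0-7,memdev=ram0 \
    -device pxb-cxl,bus_nr=12,bus=pcie.0,id=cxl.1,numa_node=0,hdm_for_passthrough=true \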

Jonathan

Patch

diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index e8d11a988fd9..935ee0b1dd26 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -242,6 +242,13 @@  static int cxl_region_invalidate_memregion(struct cxl_region *cxlr)
 	return 0;
 }
 
+static void cxl_mem_reset_numa_node(struct cxl_endpoint_decoder *cxled)
+{
+	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+
+	cxlmd->dev.numa_node = NUMA_NO_NODE;
+}
+
 static void cxl_region_decode_reset(struct cxl_region *cxlr, int count)
 {
 	struct cxl_region_params *p = &cxlr->params;
@@ -264,6 +271,7 @@  static void cxl_region_decode_reset(struct cxl_region *cxlr, int count)
 		if (cxlds->rcd)
 			goto endpoint_reset;
 
+		cxl_mem_reset_numa_node(cxled);
 		while (!is_cxl_root(to_cxl_port(iter->dev.parent)))
 			iter = to_cxl_port(iter->dev.parent);
 
@@ -304,6 +312,15 @@  static int commit_decoder(struct cxl_decoder *cxld)
 	return 0;
 }
 
+static void cxl_mem_set_numa_node(struct cxl_endpoint_decoder *cxled)
+{
+	struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
+	u64 addr = cxled->cxld.hpa_range.start;
+
+	cxlmd->dev.numa_node = phys_to_target_node(addr);
+	dev_dbg(&cxlmd->dev, "set numa node: %d\n", phys_to_target_node(addr));
+}
+
 static int cxl_region_decode_commit(struct cxl_region *cxlr)
 {
 	struct cxl_region_params *p = &cxlr->params;
@@ -340,6 +357,7 @@  static int cxl_region_decode_commit(struct cxl_region *cxlr)
 			cxled->cxld.reset(&cxled->cxld);
 			goto err;
 		}
+		cxl_mem_set_numa_node(cxled);
 	}
 
 	return 0;