[RFC,4/5] acpi/hmat: Register special purpose memory as a device
diff mbox series

Message ID 155440492988.3190322.4475460421334178449.stgit@dwillia2-desk3.amr.corp.intel.com
State New
Headers show
Series
  • EFI Special Purpose Memory Support
Related show

Commit Message

Dan Williams April 4, 2019, 7:08 p.m. UTC
Memory that has been tagged EFI_SPECIAL_PURPOSE, and has performance
properties described by the ACPI HMAT is expected to have an application
specific consumer.

Those consumers may want 100% of the memory capacity to be reserved from
any usage by the kernel. By default, with this enabling, a platform
device is created to represent this differentiated resource.

A follow on change arranges for device-dax to claim these devices by
default and provide an mmap interface for the target application.
However, if the administrator prefers that some or all of the special
purpose memory is made available to the core-mm the device-dax hotplug
facility can be used to online the memory with its own numa node.

Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Len Brown <lenb@kernel.org>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/acpi/hmat/Kconfig |    1 +
 drivers/acpi/hmat/hmat.c  |   63 +++++++++++++++++++++++++++++++++++++++++++++
 include/linux/memregion.h |    3 ++
 3 files changed, 67 insertions(+)

Comments

Jonathan Cameron April 5, 2019, 11:18 a.m. UTC | #1
On Thu, 4 Apr 2019 12:08:49 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> Memory that has been tagged EFI_SPECIAL_PURPOSE, and has performance
> properties described by the ACPI HMAT is expected to have an application
> specific consumer.
> 
> Those consumers may want 100% of the memory capacity to be reserved from
> any usage by the kernel. By default, with this enabling, a platform
> device is created to represent this differentiated resource.
> 
> A follow on change arranges for device-dax to claim these devices by
> default and provide an mmap interface for the target application.
> However, if the administrator prefers that some or all of the special
> purpose memory is made available to the core-mm the device-dax hotplug
> facility can be used to online the memory with its own numa node.
> 
> Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
> Cc: Len Brown <lenb@kernel.org>
> Cc: Keith Busch <keith.busch@intel.com>
> Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> Signed-off-by: Dan Williams <dan.j.williams@intel.com>

Hi Dan,

Great to see you getting this discussion going so fast and in
general the approach makes sense to me.

I'm a little confused why HMAT has anything to do with this.
SPM is defined either via the attribute in SRAT SPA entries,
EF_MEMORY_SP or via the EFI memory map.

Whether it is in HMAT or not isn't all that relevant.
Back in the days of the reservation hint (so before yesterday :)
it was relevant obviously but that's no longer true.

So what am I missing?

Thanks,

Jonathan


> ---
>  drivers/acpi/hmat/Kconfig |    1 +
>  drivers/acpi/hmat/hmat.c  |   63 +++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/memregion.h |    3 ++
>  3 files changed, 67 insertions(+)
> 
> diff --git a/drivers/acpi/hmat/Kconfig b/drivers/acpi/hmat/Kconfig
> index 95a29964dbea..4fcf76e8aa1d 100644
> --- a/drivers/acpi/hmat/Kconfig
> +++ b/drivers/acpi/hmat/Kconfig
> @@ -3,6 +3,7 @@ config ACPI_HMAT
>  	bool "ACPI Heterogeneous Memory Attribute Table Support"
>  	depends on ACPI_NUMA
>  	select HMEM_REPORTING
> +	select MEMREGION
>  	help
>  	 If set, this option has the kernel parse and report the
>  	 platform's ACPI HMAT (Heterogeneous Memory Attributes Table),
> diff --git a/drivers/acpi/hmat/hmat.c b/drivers/acpi/hmat/hmat.c
> index e7ae44c8d359..482360004ea0 100644
> --- a/drivers/acpi/hmat/hmat.c
> +++ b/drivers/acpi/hmat/hmat.c
> @@ -13,6 +13,9 @@
>  #include <linux/device.h>
>  #include <linux/init.h>
>  #include <linux/list.h>
> +#include <linux/mm.h>
> +#include <linux/memregion.h>
> +#include <linux/platform_device.h>
>  #include <linux/list_sort.h>
>  #include <linux/node.h>
>  #include <linux/sysfs.h>
> @@ -612,6 +615,65 @@ static __init void hmat_register_target_perf(struct memory_target *target)
>  	node_set_perf_attrs(mem_nid, &target->hmem_attrs, 0);
>  }
>  
> +static __init void hmat_register_target_device(struct memory_target *target)
> +{
> +	struct memregion_info info;
> +	struct resource res = {
> +		.start = target->start,
> +		.end = target->start + target->size - 1,
> +		.flags = IORESOURCE_MEM,
> +		.desc = IORES_DESC_APPLICATION_RESERVED,
> +	};
> +	struct platform_device *pdev;
> +	int rc, id;
> +
> +	if (region_intersects(target->start, target->size, IORESOURCE_MEM,
> +				IORES_DESC_APPLICATION_RESERVED)
> +			!= REGION_INTERSECTS)
> +		return;
> +
> +	id = memregion_alloc();
> +	if (id < 0) {
> +		pr_err("acpi/hmat: memregion allocation failure for %pr\n", &res);
> +		return;
> +	}
> +
> +	pdev = platform_device_alloc("hmem", id);
> +	if (!pdev) {
> +		pr_err("acpi/hmat: hmem device allocation failure for %pr\n", &res);
> +		goto out_pdev;
> +	}
> +
> +	pdev->dev.numa_node = acpi_map_pxm_to_online_node(target->processor_pxm);
> +	info = (struct memregion_info) {
> +		.target_node = acpi_map_pxm_to_node(target->memory_pxm),
> +	};
> +	rc = platform_device_add_data(pdev, &info, sizeof(info));
> +	if (rc < 0) {
> +		pr_err("acpi/hmat: hmem memregion_info allocation failure for %pr\n", &res);
> +		goto out_pdev;
> +	}
> +
> +	rc = platform_device_add_resources(pdev, &res, 1);
> +	if (rc < 0) {
> +		pr_err("acpi/hmat: hmem resource allocation failure for %pr\n", &res);
> +		goto out_resource;
> +	}
> +
> +	rc = platform_device_add(pdev);
> +	if (rc < 0) {
> +		dev_err(&pdev->dev, "acpi/hmat: device add failed for %pr\n", &res);
> +		goto out_resource;
> +	}
> +
> +	return;
> +
> +out_resource:
> +	put_device(&pdev->dev);
> +out_pdev:
> +	memregion_free(id);
> +}
> +
>  static __init void hmat_register_targets(void)
>  {
>  	struct memory_target *target;
> @@ -619,6 +681,7 @@ static __init void hmat_register_targets(void)
>  	list_for_each_entry(target, &targets, node) {
>  		hmat_register_target_initiators(target);
>  		hmat_register_target_perf(target);
> +		hmat_register_target_device(target);
>  	}
>  }
>  
> diff --git a/include/linux/memregion.h b/include/linux/memregion.h
> index 99fa47793b49..5de2ac7fcf5e 100644
> --- a/include/linux/memregion.h
> +++ b/include/linux/memregion.h
> @@ -1,6 +1,9 @@
>  // SPDX-License-Identifier: GPL-2.0
>  #ifndef _MEMREGION_H_
>  #define _MEMREGION_H_
> +struct memregion_info {
> +	int target_node;
> +};
>  int memregion_alloc(void);
>  void memregion_free(int id);
>  #endif /* _MEMREGION_H_ */
>
Dan Williams April 5, 2019, 3:43 p.m. UTC | #2
On Fri, Apr 5, 2019 at 4:19 AM Jonathan Cameron
<jonathan.cameron@huawei.com> wrote:
>
> On Thu, 4 Apr 2019 12:08:49 -0700
> Dan Williams <dan.j.williams@intel.com> wrote:
>
> > Memory that has been tagged EFI_SPECIAL_PURPOSE, and has performance
> > properties described by the ACPI HMAT is expected to have an application
> > specific consumer.
> >
> > Those consumers may want 100% of the memory capacity to be reserved from
> > any usage by the kernel. By default, with this enabling, a platform
> > device is created to represent this differentiated resource.
> >
> > A follow on change arranges for device-dax to claim these devices by
> > default and provide an mmap interface for the target application.
> > However, if the administrator prefers that some or all of the special
> > purpose memory is made available to the core-mm the device-dax hotplug
> > facility can be used to online the memory with its own numa node.
> >
> > Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
> > Cc: Len Brown <lenb@kernel.org>
> > Cc: Keith Busch <keith.busch@intel.com>
> > Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
>
> Hi Dan,
>
> Great to see you getting this discussion going so fast and in
> general the approach makes sense to me.
>
> I'm a little confused why HMAT has anything to do with this.
> SPM is defined either via the attribute in SRAT SPA entries,
> EF_MEMORY_SP or via the EFI memory map.
>
> Whether it is in HMAT or not isn't all that relevant.
> Back in the days of the reservation hint (so before yesterday :)
> it was relevant obviously but that's no longer true.
>
> So what am I missing?

It's a good question, and an assumption I should have explicitly
declared in the changelog. The problem with EFI_MEMORY_SP is the same
as the problem with the EfiPersistentMemory type, it isn't precise
enough on its own for the kernel to delineate 'type' or
device/replaceable-unit boundaries. For example, I expect one
EFI_MEMORY_SP range of a specific type may be contiguous with another
range of a different type. Similar to the NFIT there is no requirement
in the specification that platform firmware inject multiple range
entries. Instead that precision is left to the SRAT + HMAT, or the
NFIT in the case of PMEM.

Conversely, and thinking through this a bit more, if a memory range is
"special", but the platform fails to enumerate it in HMAT I think
Linux should scream loudly that the firmware is broken and leave the
range alone. The "scream loudly" piece is missing in the current set,
but the "leave the range alone" functionality is included.
Jonathan Cameron April 5, 2019, 4:23 p.m. UTC | #3
On Fri, 5 Apr 2019 08:43:03 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> On Fri, Apr 5, 2019 at 4:19 AM Jonathan Cameron
> <jonathan.cameron@huawei.com> wrote:
> >
> > On Thu, 4 Apr 2019 12:08:49 -0700
> > Dan Williams <dan.j.williams@intel.com> wrote:
> >  
> > > Memory that has been tagged EFI_SPECIAL_PURPOSE, and has performance
> > > properties described by the ACPI HMAT is expected to have an application
> > > specific consumer.
> > >
> > > Those consumers may want 100% of the memory capacity to be reserved from
> > > any usage by the kernel. By default, with this enabling, a platform
> > > device is created to represent this differentiated resource.
> > >
> > > A follow on change arranges for device-dax to claim these devices by
> > > default and provide an mmap interface for the target application.
> > > However, if the administrator prefers that some or all of the special
> > > purpose memory is made available to the core-mm the device-dax hotplug
> > > facility can be used to online the memory with its own numa node.
> > >
> > > Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
> > > Cc: Len Brown <lenb@kernel.org>
> > > Cc: Keith Busch <keith.busch@intel.com>
> > > Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > > Signed-off-by: Dan Williams <dan.j.williams@intel.com>  
> >
> > Hi Dan,
> >
> > Great to see you getting this discussion going so fast and in
> > general the approach makes sense to me.
> >
> > I'm a little confused why HMAT has anything to do with this.
> > SPM is defined either via the attribute in SRAT SPA entries,
> > EF_MEMORY_SP or via the EFI memory map.
> >
> > Whether it is in HMAT or not isn't all that relevant.
> > Back in the days of the reservation hint (so before yesterday :)
> > it was relevant obviously but that's no longer true.
> >
> > So what am I missing?  
> 
> It's a good question, and an assumption I should have explicitly
> declared in the changelog. The problem with EFI_MEMORY_SP is the same
> as the problem with the EfiPersistentMemory type, it isn't precise
> enough on its own for the kernel to delineate 'type' or
> device/replaceable-unit boundaries. For example, I expect one
> EFI_MEMORY_SP range of a specific type may be contiguous with another
> range of a different type. Similar to the NFIT there is no requirement
> in the specification that platform firmware inject multiple range
> entries. Instead that precision is left to the SRAT + HMAT, or the
> NFIT in the case of PMEM.

Absolutely, as long as they are all SPM, they could be anywhere in
the system.

> 
> Conversely, and thinking through this a bit more, if a memory range is
> "special", but the platform fails to enumerate it in HMAT I think
> Linux should scream loudly that the firmware is broken and leave the
> range alone. The "scream loudly" piece is missing in the current set,
> but the "leave the range alone" functionality is included.

I am certainly keen on screaming if the various entries are inconsistent
but am not sure they necessarily are here.

So there are a couple of ways we could get an SPM range defined.
The key thing here is that firmware should be attempting to describe
what it has to some degree somewhere.  If not it won't get a good
result ;)  So if there is no SRAT then you are on your own. SCREAM!

1. Directly in the memory map.  If there is no other information then
   tough luck the kernel can only sensibly handle it as one device.
   Or not at all, which seems like a reasonable decision to me.
   SCREAM

2. In memory map + a proximity domain entry in SRAT.  Given memory
   with different characteristics should be in different proximity
   domains anyway - this should be fairly precise. The slight snag
   here is that the fine grained nature of SRAT is actually a side
   effect of HMAT, so not sure well platforms have traditional
   describe their more subtle differences.

3. In NFIT as NFIT SPA carries the memory attribute.  Not sure if
   we should scream if this disagrees with the memory map.

4. In HMAT?  Now this changed in ACPI 6.3 to clean up the 'messy'
   prior relationship between it and SRAT.  Now HMAT no longer has
   memory address ranges as you observed.  That means, to describe
   properties of memory, it has to use the proximity domains of
   SRAT.  It provides lots of additional info about those domains
   but it is SRAT that defines them.

So I would argue that HMAT itself doesn't tell us anything useful.
SRAT certainly does though so I think this should be coming from
SRAT (or NFIT as that also defines the required precision)

Jonathan
Dan Williams April 5, 2019, 4:56 p.m. UTC | #4
On Fri, Apr 5, 2019 at 9:24 AM Jonathan Cameron
<jonathan.cameron@huawei.com> wrote:
>
> On Fri, 5 Apr 2019 08:43:03 -0700
> Dan Williams <dan.j.williams@intel.com> wrote:
>
> > On Fri, Apr 5, 2019 at 4:19 AM Jonathan Cameron
> > <jonathan.cameron@huawei.com> wrote:
> > >
> > > On Thu, 4 Apr 2019 12:08:49 -0700
> > > Dan Williams <dan.j.williams@intel.com> wrote:
> > >
> > > > Memory that has been tagged EFI_SPECIAL_PURPOSE, and has performance
> > > > properties described by the ACPI HMAT is expected to have an application
> > > > specific consumer.
> > > >
> > > > Those consumers may want 100% of the memory capacity to be reserved from
> > > > any usage by the kernel. By default, with this enabling, a platform
> > > > device is created to represent this differentiated resource.
> > > >
> > > > A follow on change arranges for device-dax to claim these devices by
> > > > default and provide an mmap interface for the target application.
> > > > However, if the administrator prefers that some or all of the special
> > > > purpose memory is made available to the core-mm the device-dax hotplug
> > > > facility can be used to online the memory with its own numa node.
> > > >
> > > > Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
> > > > Cc: Len Brown <lenb@kernel.org>
> > > > Cc: Keith Busch <keith.busch@intel.com>
> > > > Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > > > Signed-off-by: Dan Williams <dan.j.williams@intel.com>
> > >
> > > Hi Dan,
> > >
> > > Great to see you getting this discussion going so fast and in
> > > general the approach makes sense to me.
> > >
> > > I'm a little confused why HMAT has anything to do with this.
> > > SPM is defined either via the attribute in SRAT SPA entries,
> > > EF_MEMORY_SP or via the EFI memory map.
> > >
> > > Whether it is in HMAT or not isn't all that relevant.
> > > Back in the days of the reservation hint (so before yesterday :)
> > > it was relevant obviously but that's no longer true.
> > >
> > > So what am I missing?
> >
> > It's a good question, and an assumption I should have explicitly
> > declared in the changelog. The problem with EFI_MEMORY_SP is the same
> > as the problem with the EfiPersistentMemory type, it isn't precise
> > enough on its own for the kernel to delineate 'type' or
> > device/replaceable-unit boundaries. For example, I expect one
> > EFI_MEMORY_SP range of a specific type may be contiguous with another
> > range of a different type. Similar to the NFIT there is no requirement
> > in the specification that platform firmware inject multiple range
> > entries. Instead that precision is left to the SRAT + HMAT, or the
> > NFIT in the case of PMEM.
>
> Absolutely, as long as they are all SPM, they could be anywhere in
> the system.
>
> >
> > Conversely, and thinking through this a bit more, if a memory range is
> > "special", but the platform fails to enumerate it in HMAT I think
> > Linux should scream loudly that the firmware is broken and leave the
> > range alone. The "scream loudly" piece is missing in the current set,
> > but the "leave the range alone" functionality is included.
>
> I am certainly keen on screaming if the various entries are inconsistent
> but am not sure they necessarily are here.
>
> So there are a couple of ways we could get an SPM range defined.
> The key thing here is that firmware should be attempting to describe
> what it has to some degree somewhere.  If not it won't get a good
> result ;)  So if there is no SRAT then you are on your own. SCREAM!
>
> 1. Directly in the memory map.  If there is no other information then
>    tough luck the kernel can only sensibly handle it as one device.
>    Or not at all, which seems like a reasonable decision to me.
>    SCREAM
>
> 2. In memory map + a proximity domain entry in SRAT.  Given memory
>    with different characteristics should be in different proximity
>    domains anyway - this should be fairly precise. The slight snag
>    here is that the fine grained nature of SRAT is actually a side
>    effect of HMAT, so not sure well platforms have traditional
>    describe their more subtle differences.
>
> 3. In NFIT as NFIT SPA carries the memory attribute.  Not sure if
>    we should scream if this disagrees with the memory map.
>
> 4. In HMAT?  Now this changed in ACPI 6.3 to clean up the 'messy'
>    prior relationship between it and SRAT.  Now HMAT no longer has
>    memory address ranges as you observed.  That means, to describe
>    properties of memory, it has to use the proximity domains of
>    SRAT.  It provides lots of additional info about those domains
>    but it is SRAT that defines them.
>
> So I would argue that HMAT itself doesn't tell us anything useful.
> SRAT certainly does though so I think this should be coming from
> SRAT (or NFIT as that also defines the required precision)

I agree, yes, SRAT by itself is sufficient for this "precision"
concern. However, do we, core Linux developers, really want to
encourage platform vendors that they can ignore deploying HMAT data
and get Linux to honor that sub-case for EFI_MEMORY_SP? My personal
experience is that platform firmware will take advantage of almost any
opportunity to minimize the data it provides to the OS. The only hard
lever Linux has to encourage platform firmware to give complete data
is to decline to support configurations that have incomplete data.
Jonathan Cameron April 5, 2019, 5:39 p.m. UTC | #5
On Fri, 5 Apr 2019 09:56:22 -0700
Dan Williams <dan.j.williams@intel.com> wrote:

> On Fri, Apr 5, 2019 at 9:24 AM Jonathan Cameron
> <jonathan.cameron@huawei.com> wrote:
> >
> > On Fri, 5 Apr 2019 08:43:03 -0700
> > Dan Williams <dan.j.williams@intel.com> wrote:
> >  
> > > On Fri, Apr 5, 2019 at 4:19 AM Jonathan Cameron
> > > <jonathan.cameron@huawei.com> wrote:  
> > > >
> > > > On Thu, 4 Apr 2019 12:08:49 -0700
> > > > Dan Williams <dan.j.williams@intel.com> wrote:
> > > >  
> > > > > Memory that has been tagged EFI_SPECIAL_PURPOSE, and has performance
> > > > > properties described by the ACPI HMAT is expected to have an application
> > > > > specific consumer.
> > > > >
> > > > > Those consumers may want 100% of the memory capacity to be reserved from
> > > > > any usage by the kernel. By default, with this enabling, a platform
> > > > > device is created to represent this differentiated resource.
> > > > >
> > > > > A follow on change arranges for device-dax to claim these devices by
> > > > > default and provide an mmap interface for the target application.
> > > > > However, if the administrator prefers that some or all of the special
> > > > > purpose memory is made available to the core-mm the device-dax hotplug
> > > > > facility can be used to online the memory with its own numa node.
> > > > >
> > > > > Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
> > > > > Cc: Len Brown <lenb@kernel.org>
> > > > > Cc: Keith Busch <keith.busch@intel.com>
> > > > > Cc: Jonathan Cameron <Jonathan.Cameron@huawei.com>
> > > > > Signed-off-by: Dan Williams <dan.j.williams@intel.com>  
> > > >
> > > > Hi Dan,
> > > >
> > > > Great to see you getting this discussion going so fast and in
> > > > general the approach makes sense to me.
> > > >
> > > > I'm a little confused why HMAT has anything to do with this.
> > > > SPM is defined either via the attribute in SRAT SPA entries,
> > > > EF_MEMORY_SP or via the EFI memory map.
> > > >
> > > > Whether it is in HMAT or not isn't all that relevant.
> > > > Back in the days of the reservation hint (so before yesterday :)
> > > > it was relevant obviously but that's no longer true.
> > > >
> > > > So what am I missing?  
> > >
> > > It's a good question, and an assumption I should have explicitly
> > > declared in the changelog. The problem with EFI_MEMORY_SP is the same
> > > as the problem with the EfiPersistentMemory type, it isn't precise
> > > enough on its own for the kernel to delineate 'type' or
> > > device/replaceable-unit boundaries. For example, I expect one
> > > EFI_MEMORY_SP range of a specific type may be contiguous with another
> > > range of a different type. Similar to the NFIT there is no requirement
> > > in the specification that platform firmware inject multiple range
> > > entries. Instead that precision is left to the SRAT + HMAT, or the
> > > NFIT in the case of PMEM.  
> >
> > Absolutely, as long as they are all SPM, they could be anywhere in
> > the system.
> >  
> > >
> > > Conversely, and thinking through this a bit more, if a memory range is
> > > "special", but the platform fails to enumerate it in HMAT I think
> > > Linux should scream loudly that the firmware is broken and leave the
> > > range alone. The "scream loudly" piece is missing in the current set,
> > > but the "leave the range alone" functionality is included.  
> >
> > I am certainly keen on screaming if the various entries are inconsistent
> > but am not sure they necessarily are here.
> >
> > So there are a couple of ways we could get an SPM range defined.
> > The key thing here is that firmware should be attempting to describe
> > what it has to some degree somewhere.  If not it won't get a good
> > result ;)  So if there is no SRAT then you are on your own. SCREAM!
> >
> > 1. Directly in the memory map.  If there is no other information then
> >    tough luck the kernel can only sensibly handle it as one device.
> >    Or not at all, which seems like a reasonable decision to me.
> >    SCREAM
> >
> > 2. In memory map + a proximity domain entry in SRAT.  Given memory
> >    with different characteristics should be in different proximity
> >    domains anyway - this should be fairly precise. The slight snag
> >    here is that the fine grained nature of SRAT is actually a side
> >    effect of HMAT, so not sure well platforms have traditional
> >    describe their more subtle differences.
> >
> > 3. In NFIT as NFIT SPA carries the memory attribute.  Not sure if
> >    we should scream if this disagrees with the memory map.
> >
> > 4. In HMAT?  Now this changed in ACPI 6.3 to clean up the 'messy'
> >    prior relationship between it and SRAT.  Now HMAT no longer has
> >    memory address ranges as you observed.  That means, to describe
> >    properties of memory, it has to use the proximity domains of
> >    SRAT.  It provides lots of additional info about those domains
> >    but it is SRAT that defines them.
> >
> > So I would argue that HMAT itself doesn't tell us anything useful.
> > SRAT certainly does though so I think this should be coming from
> > SRAT (or NFIT as that also defines the required precision)  
> 
> I agree, yes, SRAT by itself is sufficient for this "precision"
> concern. However, do we, core Linux developers, really want to
> encourage platform vendors that they can ignore deploying HMAT data
> and get Linux to honor that sub-case for EFI_MEMORY_SP? My personal
> experience is that platform firmware will take advantage of almost any
> opportunity to minimize the data it provides to the OS. The only hard
> lever Linux has to encourage platform firmware to give complete data
> is to decline to support configurations that have incomplete data.
> 

If we decide as a community that this is the way we want to go, I'm
happy to politely point it out to our firmware people (who are a more
proactive group on detailed system descriptions than many!)

If we make this a clearly stated policy, perhaps via some comments
in the code or Documentation/ that that would be even better
and avoid people taking the 'but you could support my firmware'
line in the future.

I'll see if I can reach out to other OS vendors as well so we
can present a unified front on this (perhaps after a few days, just
in case we have any dissenting voices here!)

Thanks,

Jonathan
Christoph Hellwig April 9, 2019, 12:13 p.m. UTC | #6
On Thu, Apr 04, 2019 at 12:08:49PM -0700, Dan Williams wrote:
> Memory that has been tagged EFI_SPECIAL_PURPOSE, and has performance
> properties described by the ACPI HMAT is expected to have an application
> specific consumer.
> 
> Those consumers may want 100% of the memory capacity to be reserved from
> any usage by the kernel. By default, with this enabling, a platform
> device is created to represent this differentiated resource.

This sounds more than weird.  Since when did we let the firmware decide
who can use the memory?
Dan Williams April 9, 2019, 2:49 p.m. UTC | #7
On Tue, Apr 9, 2019 at 5:13 AM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Thu, Apr 04, 2019 at 12:08:49PM -0700, Dan Williams wrote:
> > Memory that has been tagged EFI_SPECIAL_PURPOSE, and has performance
> > properties described by the ACPI HMAT is expected to have an application
> > specific consumer.
> >
> > Those consumers may want 100% of the memory capacity to be reserved from
> > any usage by the kernel. By default, with this enabling, a platform
> > device is created to represent this differentiated resource.
>
> This sounds more than weird.  Since when did we let the firmware decide
> who can use the memory?

There's 2 related motivations for playing along with this "special
purpose" attribute. Before this bit we've seen gross hacks in platform
firmware trying to game OS behavior by lying about numa distances in
the ACPI SLIT. For example "near" high bandwidth memory being set at a
large distance to prevent the kernel from allocating from it by
default as much as possible. Secondly, allow niche applications
guarantees about being able to claim all of a given designated
resource.

The above comes with the option to override this default reservation
and just turn it back over to the page allocator i.e. ignore the
platform firmware hint.

The alternative is arranging for "special purpose" memory to be given
to the page allocator by default with the hope that it can be reserved
/ claimed early so the administrator can prevent unwanted allocations.
It just seemed overall easier to "default reserve with the option to
hot-add" instead of "default online with option / hope of hot-remove
or early allocation".

Patch
diff mbox series

diff --git a/drivers/acpi/hmat/Kconfig b/drivers/acpi/hmat/Kconfig
index 95a29964dbea..4fcf76e8aa1d 100644
--- a/drivers/acpi/hmat/Kconfig
+++ b/drivers/acpi/hmat/Kconfig
@@ -3,6 +3,7 @@  config ACPI_HMAT
 	bool "ACPI Heterogeneous Memory Attribute Table Support"
 	depends on ACPI_NUMA
 	select HMEM_REPORTING
+	select MEMREGION
 	help
 	 If set, this option has the kernel parse and report the
 	 platform's ACPI HMAT (Heterogeneous Memory Attributes Table),
diff --git a/drivers/acpi/hmat/hmat.c b/drivers/acpi/hmat/hmat.c
index e7ae44c8d359..482360004ea0 100644
--- a/drivers/acpi/hmat/hmat.c
+++ b/drivers/acpi/hmat/hmat.c
@@ -13,6 +13,9 @@ 
 #include <linux/device.h>
 #include <linux/init.h>
 #include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/memregion.h>
+#include <linux/platform_device.h>
 #include <linux/list_sort.h>
 #include <linux/node.h>
 #include <linux/sysfs.h>
@@ -612,6 +615,65 @@  static __init void hmat_register_target_perf(struct memory_target *target)
 	node_set_perf_attrs(mem_nid, &target->hmem_attrs, 0);
 }
 
+static __init void hmat_register_target_device(struct memory_target *target)
+{
+	struct memregion_info info;
+	struct resource res = {
+		.start = target->start,
+		.end = target->start + target->size - 1,
+		.flags = IORESOURCE_MEM,
+		.desc = IORES_DESC_APPLICATION_RESERVED,
+	};
+	struct platform_device *pdev;
+	int rc, id;
+
+	if (region_intersects(target->start, target->size, IORESOURCE_MEM,
+				IORES_DESC_APPLICATION_RESERVED)
+			!= REGION_INTERSECTS)
+		return;
+
+	id = memregion_alloc();
+	if (id < 0) {
+		pr_err("acpi/hmat: memregion allocation failure for %pr\n", &res);
+		return;
+	}
+
+	pdev = platform_device_alloc("hmem", id);
+	if (!pdev) {
+		pr_err("acpi/hmat: hmem device allocation failure for %pr\n", &res);
+		goto out_pdev;
+	}
+
+	pdev->dev.numa_node = acpi_map_pxm_to_online_node(target->processor_pxm);
+	info = (struct memregion_info) {
+		.target_node = acpi_map_pxm_to_node(target->memory_pxm),
+	};
+	rc = platform_device_add_data(pdev, &info, sizeof(info));
+	if (rc < 0) {
+		pr_err("acpi/hmat: hmem memregion_info allocation failure for %pr\n", &res);
+		goto out_pdev;
+	}
+
+	rc = platform_device_add_resources(pdev, &res, 1);
+	if (rc < 0) {
+		pr_err("acpi/hmat: hmem resource allocation failure for %pr\n", &res);
+		goto out_resource;
+	}
+
+	rc = platform_device_add(pdev);
+	if (rc < 0) {
+		dev_err(&pdev->dev, "acpi/hmat: device add failed for %pr\n", &res);
+		goto out_resource;
+	}
+
+	return;
+
+out_resource:
+	put_device(&pdev->dev);
+out_pdev:
+	memregion_free(id);
+}
+
 static __init void hmat_register_targets(void)
 {
 	struct memory_target *target;
@@ -619,6 +681,7 @@  static __init void hmat_register_targets(void)
 	list_for_each_entry(target, &targets, node) {
 		hmat_register_target_initiators(target);
 		hmat_register_target_perf(target);
+		hmat_register_target_device(target);
 	}
 }
 
diff --git a/include/linux/memregion.h b/include/linux/memregion.h
index 99fa47793b49..5de2ac7fcf5e 100644
--- a/include/linux/memregion.h
+++ b/include/linux/memregion.h
@@ -1,6 +1,9 @@ 
 // SPDX-License-Identifier: GPL-2.0
 #ifndef _MEMREGION_H_
 #define _MEMREGION_H_
+struct memregion_info {
+	int target_node;
+};
 int memregion_alloc(void);
 void memregion_free(int id);
 #endif /* _MEMREGION_H_ */