diff mbox

[v2] ACPI: Add sysfs links from memory device to memblocks

Message ID 1361826130-31062-1-git-send-email-toshi.kani@hp.com (mailing list archive)
State Rejected, archived
Headers show

Commit Message

Toshi Kani Feb. 25, 2013, 9:02 p.m. UTC
In order to eject a memory device object represented as "PNP0C80:%d"
in sysfs, its associated memblocks (system/memory/memory%d) need to
be off-lined.  However, there is no user friendly way to correlate
between a memory device object and its memblocks in sysfs.

This patch creates sysfs links to memblocks under a memory device
object so that a user can easily checks and manipulates its memblocks
in sysfs.

For example, when PNP0C80:05 is associated with memory8 and memory9,
the following two links are created under PNP0C80:05.  This allows
a user to access memory8/9 directly from PNP0C80:05.

  # ll /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0C80:05
  lrwxrwxrwx. memory8 -> ../../../system/memory/memory8
  lrwxrwxrwx. memory9 -> ../../../system/memory/memory9

Signed-off-by: Toshi Kani <toshi.kani@hp.com>
---

This patch applies on top of the Rafael's patch below.
https://patchwork.kernel.org/patch/2153261/

v2: Added a NULL return check for find_memory_block_hinted() as
pointed by Yasuaki Ishimatsu.

---
 drivers/acpi/acpi_memhotplug.c |   56 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Yasuaki Ishimatsu Feb. 26, 2013, 5:37 a.m. UTC | #1
2013/02/26 6:02, Toshi Kani wrote:
> In order to eject a memory device object represented as "PNP0C80:%d"
> in sysfs, its associated memblocks (system/memory/memory%d) need to
> be off-lined.  However, there is no user friendly way to correlate
> between a memory device object and its memblocks in sysfs.
> 
> This patch creates sysfs links to memblocks under a memory device
> object so that a user can easily checks and manipulates its memblocks
> in sysfs.
> 
> For example, when PNP0C80:05 is associated with memory8 and memory9,
> the following two links are created under PNP0C80:05.  This allows
> a user to access memory8/9 directly from PNP0C80:05.
> 
>    # ll /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0C80:05
>    lrwxrwxrwx. memory8 -> ../../../system/memory/memory8
>    lrwxrwxrwx. memory9 -> ../../../system/memory/memory9
> 
> Signed-off-by: Toshi Kani <toshi.kani@hp.com>
> ---

I confirmed that the patch has no problem.
Feel free to add:

Tested-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Acked-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>

Thanks,
Yasuaki Ishimatsu

> 
> This patch applies on top of the Rafael's patch below.
> https://patchwork.kernel.org/patch/2153261/
> 
> v2: Added a NULL return check for find_memory_block_hinted() as
> pointed by Yasuaki Ishimatsu.
> 
> ---
>   drivers/acpi/acpi_memhotplug.c |   56 ++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 56 insertions(+)
> 
> diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
> index 3b3abbc..98477a5 100644
> --- a/drivers/acpi/acpi_memhotplug.c
> +++ b/drivers/acpi/acpi_memhotplug.c
> @@ -28,6 +28,7 @@
>    */
>   
>   #include <linux/acpi.h>
> +#include <linux/memory.h>
>   #include <linux/memory_hotplug.h>
>   
>   #include "internal.h"
> @@ -168,6 +169,55 @@ static int acpi_memory_check_device(struct acpi_memory_device *mem_device)
>   	return 0;
>   }
>   
> +static void acpi_setup_mem_blk_links(struct acpi_memory_device *mem_device,
> +		struct acpi_memory_info *info, bool add_links)
> +{
> +	struct memory_block *mem_blk = NULL;
> +	struct mem_section *mem_sect;
> +	unsigned long start_pfn, end_pfn, pfn;
> +	unsigned long section_nr;
> +	int ret;
> +
> +	start_pfn = PFN_DOWN(info->start_addr);
> +	end_pfn = PFN_UP(info->start_addr + info->length-1);
> +
> +	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
> +		section_nr = pfn_to_section_nr(pfn);
> +
> +		if (!present_section_nr(section_nr))
> +			continue;
> +
> +		mem_sect = __nr_to_section(section_nr);
> +
> +		/* skip if the same memblock */
> +		if (mem_blk)
> +			if ((section_nr >= mem_blk->start_section_nr) &&
> +			    (section_nr <= mem_blk->end_section_nr))
> +				continue;
> +
> +		mem_blk = find_memory_block_hinted(mem_sect, mem_blk);
> +		if (!mem_blk)
> +			continue;
> +
> +		if (add_links) {
> +			ret = sysfs_create_link_nowarn(
> +				&mem_device->device->dev.kobj,
> +				&mem_blk->dev.kobj,
> +				kobject_name(&mem_blk->dev.kobj));
> +			if (ret && ret != -EEXIST)
> +				dev_err(&mem_device->device->dev,
> +					"Failed to create sysfs link %s\n",
> +					kobject_name(&mem_blk->dev.kobj));
> +		} else {
> +			sysfs_remove_link(&mem_device->device->dev.kobj,
> +				kobject_name(&mem_blk->dev.kobj));
> +		}
> +	}
> +
> +	if (mem_blk)
> +		kobject_put(&mem_blk->dev.kobj);
> +}
> +
>   static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
>   {
>   	int result, num_enabled = 0;
> @@ -207,6 +257,9 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
>   			continue;
>   		}
>   
> +		/* Create sysfs links to its mem_blk devices */
> +		acpi_setup_mem_blk_links(mem_device, info, true);
> +
>   		if (!result)
>   			info->enabled = 1;
>   		/*
> @@ -241,6 +294,9 @@ static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
>   			/* The kernel does not use this memory block */
>   			continue;
>   
> +		/* Remove sysfs links to its mem_blk devices */
> +		acpi_setup_mem_blk_links(mem_device, info, false);
> +
>   		if (!info->enabled)
>   			/*
>   			 * The kernel uses this memory block, but it may be not
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Toshi Kani Feb. 26, 2013, 5:16 p.m. UTC | #2
On Tue, 2013-02-26 at 05:37 +0000, Yasuaki Ishimatsu wrote:
> 2013/02/26 6:02, Toshi Kani wrote:
> > In order to eject a memory device object represented as "PNP0C80:%d"
> > in sysfs, its associated memblocks (system/memory/memory%d) need to
> > be off-lined.  However, there is no user friendly way to correlate
> > between a memory device object and its memblocks in sysfs.
> > 
> > This patch creates sysfs links to memblocks under a memory device
> > object so that a user can easily checks and manipulates its memblocks
> > in sysfs.
> > 
> > For example, when PNP0C80:05 is associated with memory8 and memory9,
> > the following two links are created under PNP0C80:05.  This allows
> > a user to access memory8/9 directly from PNP0C80:05.
> > 
> >    # ll /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0C80:05
> >    lrwxrwxrwx. memory8 -> ../../../system/memory/memory8
> >    lrwxrwxrwx. memory9 -> ../../../system/memory/memory9
> > 
> > Signed-off-by: Toshi Kani <toshi.kani@hp.com>
> > ---
> 
> I confirmed that the patch has no problem.
> Feel free to add:
> 
> Tested-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
> Acked-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>

Great!  Thanks Yasuaki!
-Toshi


--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael Wysocki March 26, 2013, 1:04 p.m. UTC | #3
On Monday, February 25, 2013 02:02:10 PM Toshi Kani wrote:
> In order to eject a memory device object represented as "PNP0C80:%d"
> in sysfs, its associated memblocks (system/memory/memory%d) need to
> be off-lined.  However, there is no user friendly way to correlate
> between a memory device object and its memblocks in sysfs.
> 
> This patch creates sysfs links to memblocks under a memory device
> object so that a user can easily checks and manipulates its memblocks
> in sysfs.
> 
> For example, when PNP0C80:05 is associated with memory8 and memory9,
> the following two links are created under PNP0C80:05.  This allows
> a user to access memory8/9 directly from PNP0C80:05.
> 
>   # ll /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0C80:05
>   lrwxrwxrwx. memory8 -> ../../../system/memory/memory8
>   lrwxrwxrwx. memory9 -> ../../../system/memory/memory9
> 
> Signed-off-by: Toshi Kani <toshi.kani@hp.com>

Here I have some doubts.

This adds a very specific interface for user space that we're going to need to
maintain going forward if the user space starts to use it.  However, it kind of
duplicates the existing "physical_node" interface that we have for "regular"
devices.

So, if possible, I'd like the memory subsystem to utilize the existing
interface instead of creating an entirely new one.  Namely, why don't we create
a struct device-based object for each memory block and associated those new
"devices" with the PNP0C80 ACPI object through the functions in glue.c?
Then, we could add an "offline/online" interface to those "devices" too.

Thanks,
Rafael


> ---
> 
> This patch applies on top of the Rafael's patch below.
> https://patchwork.kernel.org/patch/2153261/
> 
> v2: Added a NULL return check for find_memory_block_hinted() as
> pointed by Yasuaki Ishimatsu.
> 
> ---
>  drivers/acpi/acpi_memhotplug.c |   56 ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 56 insertions(+)
> 
> diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
> index 3b3abbc..98477a5 100644
> --- a/drivers/acpi/acpi_memhotplug.c
> +++ b/drivers/acpi/acpi_memhotplug.c
> @@ -28,6 +28,7 @@
>   */
>  
>  #include <linux/acpi.h>
> +#include <linux/memory.h>
>  #include <linux/memory_hotplug.h>
>  
>  #include "internal.h"
> @@ -168,6 +169,55 @@ static int acpi_memory_check_device(struct acpi_memory_device *mem_device)
>  	return 0;
>  }
>  
> +static void acpi_setup_mem_blk_links(struct acpi_memory_device *mem_device,
> +		struct acpi_memory_info *info, bool add_links)
> +{
> +	struct memory_block *mem_blk = NULL;
> +	struct mem_section *mem_sect;
> +	unsigned long start_pfn, end_pfn, pfn;
> +	unsigned long section_nr;
> +	int ret;
> +
> +	start_pfn = PFN_DOWN(info->start_addr);
> +	end_pfn = PFN_UP(info->start_addr + info->length-1);
> +
> +	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
> +		section_nr = pfn_to_section_nr(pfn);
> +
> +		if (!present_section_nr(section_nr))
> +			continue;
> +
> +		mem_sect = __nr_to_section(section_nr);
> +
> +		/* skip if the same memblock */
> +		if (mem_blk)
> +			if ((section_nr >= mem_blk->start_section_nr) &&
> +			    (section_nr <= mem_blk->end_section_nr))
> +				continue;
> +
> +		mem_blk = find_memory_block_hinted(mem_sect, mem_blk);
> +		if (!mem_blk)
> +			continue;
> +
> +		if (add_links) {
> +			ret = sysfs_create_link_nowarn(
> +				&mem_device->device->dev.kobj,
> +				&mem_blk->dev.kobj,
> +				kobject_name(&mem_blk->dev.kobj));
> +			if (ret && ret != -EEXIST)
> +				dev_err(&mem_device->device->dev,
> +					"Failed to create sysfs link %s\n",
> +					kobject_name(&mem_blk->dev.kobj));
> +		} else {
> +			sysfs_remove_link(&mem_device->device->dev.kobj,
> +				kobject_name(&mem_blk->dev.kobj));
> +		}
> +	}
> +
> +	if (mem_blk)
> +		kobject_put(&mem_blk->dev.kobj);
> +}
> +
>  static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
>  {
>  	int result, num_enabled = 0;
> @@ -207,6 +257,9 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
>  			continue;
>  		}
>  
> +		/* Create sysfs links to its mem_blk devices */
> +		acpi_setup_mem_blk_links(mem_device, info, true);
> +
>  		if (!result)
>  			info->enabled = 1;
>  		/*
> @@ -241,6 +294,9 @@ static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
>  			/* The kernel does not use this memory block */
>  			continue;
>  
> +		/* Remove sysfs links to its mem_blk devices */
> +		acpi_setup_mem_blk_links(mem_device, info, false);
> +
>  		if (!info->enabled)
>  			/*
>  			 * The kernel uses this memory block, but it may be not
> --
> To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Toshi Kani March 26, 2013, 3:42 p.m. UTC | #4
On Tue, 2013-03-26 at 14:04 +0100, Rafael J. Wysocki wrote:
> On Monday, February 25, 2013 02:02:10 PM Toshi Kani wrote:
> > In order to eject a memory device object represented as "PNP0C80:%d"
> > in sysfs, its associated memblocks (system/memory/memory%d) need to
> > be off-lined.  However, there is no user friendly way to correlate
> > between a memory device object and its memblocks in sysfs.
> > 
> > This patch creates sysfs links to memblocks under a memory device
> > object so that a user can easily checks and manipulates its memblocks
> > in sysfs.
> > 
> > For example, when PNP0C80:05 is associated with memory8 and memory9,
> > the following two links are created under PNP0C80:05.  This allows
> > a user to access memory8/9 directly from PNP0C80:05.
> > 
> >   # ll /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0C80:05
> >   lrwxrwxrwx. memory8 -> ../../../system/memory/memory8
> >   lrwxrwxrwx. memory9 -> ../../../system/memory/memory9
> > 
> > Signed-off-by: Toshi Kani <toshi.kani@hp.com>
> 
> Here I have some doubts.
> 
> This adds a very specific interface for user space that we're going to need to
> maintain going forward if the user space starts to use it.  However, it kind of
> duplicates the existing "physical_node" interface that we have for "regular"
> devices.
> 
> So, if possible, I'd like the memory subsystem to utilize the existing
> interface instead of creating an entirely new one.  Namely, why don't we create
> a struct device-based object for each memory block and associated those new
> "devices" with the PNP0C80 ACPI object through the functions in glue.c?
> Then, we could add an "offline/online" interface to those "devices" too.

This patch simply adds symbolic links to system/memory/memoryN, which
the memory subsystem already provides for the online/offline interface
of memory blocks.  So, it does not introduce a new interface, but guides
users (and user tools) to know which memory blocks need to be off-lined
in order to hot-delete any particular memory device PNP0C80:X.  A cpu
device LNXCPU:X also has a similar symbolic link "sysdev" that links to
system/cpu/cpuN.  I could not use the same "sysdev" for PNP0C80:X since
it typically associates with multiple memory blocks.

I thought about using glue.c to create symbolic links between memoryN
and PNP0C80:X.  However, it has an ordering issue.  During boot-time,
memoryN gets created before PNP0C80:X.  But during hot-add, PNP0C80:X
gets created before memoryN.  This patch calls
acpi_setup_mem_blk_links() in a point that solves this ordering issue
since this point guarantees that both memoryN and PNP0C80X are created
for both boot-time and hot-add.

Thanks,
-Toshi


> 
> Thanks,
> Rafael
> 
> 
> > ---
> > 
> > This patch applies on top of the Rafael's patch below.
> > https://patchwork.kernel.org/patch/2153261/
> > 
> > v2: Added a NULL return check for find_memory_block_hinted() as
> > pointed by Yasuaki Ishimatsu.
> > 
> > ---
> >  drivers/acpi/acpi_memhotplug.c |   56 ++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 56 insertions(+)
> > 
> > diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
> > index 3b3abbc..98477a5 100644
> > --- a/drivers/acpi/acpi_memhotplug.c
> > +++ b/drivers/acpi/acpi_memhotplug.c
> > @@ -28,6 +28,7 @@
> >   */
> >  
> >  #include <linux/acpi.h>
> > +#include <linux/memory.h>
> >  #include <linux/memory_hotplug.h>
> >  
> >  #include "internal.h"
> > @@ -168,6 +169,55 @@ static int acpi_memory_check_device(struct acpi_memory_device *mem_device)
> >  	return 0;
> >  }
> >  
> > +static void acpi_setup_mem_blk_links(struct acpi_memory_device *mem_device,
> > +		struct acpi_memory_info *info, bool add_links)
> > +{
> > +	struct memory_block *mem_blk = NULL;
> > +	struct mem_section *mem_sect;
> > +	unsigned long start_pfn, end_pfn, pfn;
> > +	unsigned long section_nr;
> > +	int ret;
> > +
> > +	start_pfn = PFN_DOWN(info->start_addr);
> > +	end_pfn = PFN_UP(info->start_addr + info->length-1);
> > +
> > +	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
> > +		section_nr = pfn_to_section_nr(pfn);
> > +
> > +		if (!present_section_nr(section_nr))
> > +			continue;
> > +
> > +		mem_sect = __nr_to_section(section_nr);
> > +
> > +		/* skip if the same memblock */
> > +		if (mem_blk)
> > +			if ((section_nr >= mem_blk->start_section_nr) &&
> > +			    (section_nr <= mem_blk->end_section_nr))
> > +				continue;
> > +
> > +		mem_blk = find_memory_block_hinted(mem_sect, mem_blk);
> > +		if (!mem_blk)
> > +			continue;
> > +
> > +		if (add_links) {
> > +			ret = sysfs_create_link_nowarn(
> > +				&mem_device->device->dev.kobj,
> > +				&mem_blk->dev.kobj,
> > +				kobject_name(&mem_blk->dev.kobj));
> > +			if (ret && ret != -EEXIST)
> > +				dev_err(&mem_device->device->dev,
> > +					"Failed to create sysfs link %s\n",
> > +					kobject_name(&mem_blk->dev.kobj));
> > +		} else {
> > +			sysfs_remove_link(&mem_device->device->dev.kobj,
> > +				kobject_name(&mem_blk->dev.kobj));
> > +		}
> > +	}
> > +
> > +	if (mem_blk)
> > +		kobject_put(&mem_blk->dev.kobj);
> > +}
> > +
> >  static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
> >  {
> >  	int result, num_enabled = 0;
> > @@ -207,6 +257,9 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
> >  			continue;
> >  		}
> >  
> > +		/* Create sysfs links to its mem_blk devices */
> > +		acpi_setup_mem_blk_links(mem_device, info, true);
> > +
> >  		if (!result)
> >  			info->enabled = 1;
> >  		/*
> > @@ -241,6 +294,9 @@ static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
> >  			/* The kernel does not use this memory block */
> >  			continue;
> >  
> > +		/* Remove sysfs links to its mem_blk devices */
> > +		acpi_setup_mem_blk_links(mem_device, info, false);
> > +
> >  		if (!info->enabled)
> >  			/*
> >  			 * The kernel uses this memory block, but it may be not
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael Wysocki March 26, 2013, 10:39 p.m. UTC | #5
On Tuesday, March 26, 2013 09:42:06 AM Toshi Kani wrote:
> On Tue, 2013-03-26 at 14:04 +0100, Rafael J. Wysocki wrote:
> > On Monday, February 25, 2013 02:02:10 PM Toshi Kani wrote:
> > > In order to eject a memory device object represented as "PNP0C80:%d"
> > > in sysfs, its associated memblocks (system/memory/memory%d) need to
> > > be off-lined.  However, there is no user friendly way to correlate
> > > between a memory device object and its memblocks in sysfs.
> > > 
> > > This patch creates sysfs links to memblocks under a memory device
> > > object so that a user can easily checks and manipulates its memblocks
> > > in sysfs.
> > > 
> > > For example, when PNP0C80:05 is associated with memory8 and memory9,
> > > the following two links are created under PNP0C80:05.  This allows
> > > a user to access memory8/9 directly from PNP0C80:05.
> > > 
> > >   # ll /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0C80:05
> > >   lrwxrwxrwx. memory8 -> ../../../system/memory/memory8
> > >   lrwxrwxrwx. memory9 -> ../../../system/memory/memory9
> > > 
> > > Signed-off-by: Toshi Kani <toshi.kani@hp.com>
> > 
> > Here I have some doubts.
> > 
> > This adds a very specific interface for user space that we're going to need to
> > maintain going forward if the user space starts to use it.  However, it kind of
> > duplicates the existing "physical_node" interface that we have for "regular"
> > devices.
> > 
> > So, if possible, I'd like the memory subsystem to utilize the existing
> > interface instead of creating an entirely new one.  Namely, why don't we create
> > a struct device-based object for each memory block and associated those new
> > "devices" with the PNP0C80 ACPI object through the functions in glue.c?
> > Then, we could add an "offline/online" interface to those "devices" too.
> 
> This patch simply adds symbolic links to system/memory/memoryN, which
> the memory subsystem already provides for the online/offline interface
> of memory blocks.  So, it does not introduce a new interface, but guides
> users (and user tools) to know which memory blocks need to be off-lined
> in order to hot-delete any particular memory device PNP0C80:X.  A cpu
> device LNXCPU:X also has a similar symbolic link "sysdev" that links to
> system/cpu/cpuN.  I could not use the same "sysdev" for PNP0C80:X since
> it typically associates with multiple memory blocks.
> 
> I thought about using glue.c to create symbolic links between memoryN
> and PNP0C80:X.  However, it has an ordering issue.  During boot-time,
> memoryN gets created before PNP0C80:X.  But during hot-add, PNP0C80:X
> gets created before memoryN.

Quite frankly, this sounds like a bug to me.  Namely, what is memoryN really
good for without PNP0C80:X?  If it is not good for anything in that case,
it should never be created befor PNP0C80:X.

> This patch calls
> acpi_setup_mem_blk_links() in a point that solves this ordering issue
> since this point guarantees that both memoryN and PNP0C80X are created
> for both boot-time and hot-add.

I would prefer the ordering of creation to be the same in both cases.
Otherwise it really looks like we need to work around a problem that we're
creating for ourselves.

How exactly are memoryN created during boot?

Rafael


> > > ---
> > > 
> > > This patch applies on top of the Rafael's patch below.
> > > https://patchwork.kernel.org/patch/2153261/
> > > 
> > > v2: Added a NULL return check for find_memory_block_hinted() as
> > > pointed by Yasuaki Ishimatsu.
> > > 
> > > ---
> > >  drivers/acpi/acpi_memhotplug.c |   56 ++++++++++++++++++++++++++++++++++++++++
> > >  1 file changed, 56 insertions(+)
> > > 
> > > diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
> > > index 3b3abbc..98477a5 100644
> > > --- a/drivers/acpi/acpi_memhotplug.c
> > > +++ b/drivers/acpi/acpi_memhotplug.c
> > > @@ -28,6 +28,7 @@
> > >   */
> > >  
> > >  #include <linux/acpi.h>
> > > +#include <linux/memory.h>
> > >  #include <linux/memory_hotplug.h>
> > >  
> > >  #include "internal.h"
> > > @@ -168,6 +169,55 @@ static int acpi_memory_check_device(struct acpi_memory_device *mem_device)
> > >  	return 0;
> > >  }
> > >  
> > > +static void acpi_setup_mem_blk_links(struct acpi_memory_device *mem_device,
> > > +		struct acpi_memory_info *info, bool add_links)
> > > +{
> > > +	struct memory_block *mem_blk = NULL;
> > > +	struct mem_section *mem_sect;
> > > +	unsigned long start_pfn, end_pfn, pfn;
> > > +	unsigned long section_nr;
> > > +	int ret;
> > > +
> > > +	start_pfn = PFN_DOWN(info->start_addr);
> > > +	end_pfn = PFN_UP(info->start_addr + info->length-1);
> > > +
> > > +	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
> > > +		section_nr = pfn_to_section_nr(pfn);
> > > +
> > > +		if (!present_section_nr(section_nr))
> > > +			continue;
> > > +
> > > +		mem_sect = __nr_to_section(section_nr);
> > > +
> > > +		/* skip if the same memblock */
> > > +		if (mem_blk)
> > > +			if ((section_nr >= mem_blk->start_section_nr) &&
> > > +			    (section_nr <= mem_blk->end_section_nr))
> > > +				continue;
> > > +
> > > +		mem_blk = find_memory_block_hinted(mem_sect, mem_blk);
> > > +		if (!mem_blk)
> > > +			continue;
> > > +
> > > +		if (add_links) {
> > > +			ret = sysfs_create_link_nowarn(
> > > +				&mem_device->device->dev.kobj,
> > > +				&mem_blk->dev.kobj,
> > > +				kobject_name(&mem_blk->dev.kobj));
> > > +			if (ret && ret != -EEXIST)
> > > +				dev_err(&mem_device->device->dev,
> > > +					"Failed to create sysfs link %s\n",
> > > +					kobject_name(&mem_blk->dev.kobj));
> > > +		} else {
> > > +			sysfs_remove_link(&mem_device->device->dev.kobj,
> > > +				kobject_name(&mem_blk->dev.kobj));
> > > +		}
> > > +	}
> > > +
> > > +	if (mem_blk)
> > > +		kobject_put(&mem_blk->dev.kobj);
> > > +}
> > > +
> > >  static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
> > >  {
> > >  	int result, num_enabled = 0;
> > > @@ -207,6 +257,9 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
> > >  			continue;
> > >  		}
> > >  
> > > +		/* Create sysfs links to its mem_blk devices */
> > > +		acpi_setup_mem_blk_links(mem_device, info, true);
> > > +
> > >  		if (!result)
> > >  			info->enabled = 1;
> > >  		/*
> > > @@ -241,6 +294,9 @@ static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
> > >  			/* The kernel does not use this memory block */
> > >  			continue;
> > >  
> > > +		/* Remove sysfs links to its mem_blk devices */
> > > +		acpi_setup_mem_blk_links(mem_device, info, false);
> > > +
> > >  		if (!info->enabled)
> > >  			/*
> > >  			 * The kernel uses this memory block, but it may be not
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
>
Toshi Kani March 26, 2013, 10:59 p.m. UTC | #6
On Tue, 2013-03-26 at 23:39 +0100, Rafael J. Wysocki wrote:
> On Tuesday, March 26, 2013 09:42:06 AM Toshi Kani wrote:
> > On Tue, 2013-03-26 at 14:04 +0100, Rafael J. Wysocki wrote:
> > > On Monday, February 25, 2013 02:02:10 PM Toshi Kani wrote:
> > > > In order to eject a memory device object represented as "PNP0C80:%d"
> > > > in sysfs, its associated memblocks (system/memory/memory%d) need to
> > > > be off-lined.  However, there is no user friendly way to correlate
> > > > between a memory device object and its memblocks in sysfs.
> > > > 
> > > > This patch creates sysfs links to memblocks under a memory device
> > > > object so that a user can easily checks and manipulates its memblocks
> > > > in sysfs.
> > > > 
> > > > For example, when PNP0C80:05 is associated with memory8 and memory9,
> > > > the following two links are created under PNP0C80:05.  This allows
> > > > a user to access memory8/9 directly from PNP0C80:05.
> > > > 
> > > >   # ll /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0C80:05
> > > >   lrwxrwxrwx. memory8 -> ../../../system/memory/memory8
> > > >   lrwxrwxrwx. memory9 -> ../../../system/memory/memory9
> > > > 
> > > > Signed-off-by: Toshi Kani <toshi.kani@hp.com>
> > > 
> > > Here I have some doubts.
> > > 
> > > This adds a very specific interface for user space that we're going to need to
> > > maintain going forward if the user space starts to use it.  However, it kind of
> > > duplicates the existing "physical_node" interface that we have for "regular"
> > > devices.
> > > 
> > > So, if possible, I'd like the memory subsystem to utilize the existing
> > > interface instead of creating an entirely new one.  Namely, why don't we create
> > > a struct device-based object for each memory block and associated those new
> > > "devices" with the PNP0C80 ACPI object through the functions in glue.c?
> > > Then, we could add an "offline/online" interface to those "devices" too.
> > 
> > This patch simply adds symbolic links to system/memory/memoryN, which
> > the memory subsystem already provides for the online/offline interface
> > of memory blocks.  So, it does not introduce a new interface, but guides
> > users (and user tools) to know which memory blocks need to be off-lined
> > in order to hot-delete any particular memory device PNP0C80:X.  A cpu
> > device LNXCPU:X also has a similar symbolic link "sysdev" that links to
> > system/cpu/cpuN.  I could not use the same "sysdev" for PNP0C80:X since
> > it typically associates with multiple memory blocks.
> > 
> > I thought about using glue.c to create symbolic links between memoryN
> > and PNP0C80:X.  However, it has an ordering issue.  During boot-time,
> > memoryN gets created before PNP0C80:X.  But during hot-add, PNP0C80:X
> > gets created before memoryN.
> 
> Quite frankly, this sounds like a bug to me.  Namely, what is memoryN really
> good for without PNP0C80:X?  If it is not good for anything in that case,
> it should never be created befor PNP0C80:X.

memoryN works without PNP0C80:X and does not depend on ACPI.  A memoryN
represents a memblk, each of which is merely a 128MB (in case of x86) of
memory chunk sliced from the entire memory ranges.  This is why it is
hard to associate between memoryN and PNP0C80:X without these symbolic
links (otherwise, you will have to calculate from memory address.)

The memory subsystem also obtains the memory ranges from EFI or e820
during boot, and ACPI is not necessary to construct memoryN at boot.
Since EFI / e820 only provides the boot-time configuration, ACPI is used
to update the memory ranges during hot-add/delete.

> > This patch calls
> > acpi_setup_mem_blk_links() in a point that solves this ordering issue
> > since this point guarantees that both memoryN and PNP0C80X are created
> > for both boot-time and hot-add.
> 
> I would prefer the ordering of creation to be the same in both cases.
> Otherwise it really looks like we need to work around a problem that we're
> creating for ourselves.
> 
> How exactly are memoryN created during boot?

memoryN is created in memory_dev_init().  This is even before
do_initcalls().  I'd also prefer the same ordering, but I felt that
changing this ordering would be rather challenging.

do_basic_setup()
  driver_init()
    memory_dev_init()

Thanks,
-Toshi



--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael Wysocki March 26, 2013, 11:39 p.m. UTC | #7
On Tuesday, March 26, 2013 04:59:36 PM Toshi Kani wrote:
> On Tue, 2013-03-26 at 23:39 +0100, Rafael J. Wysocki wrote:
> > On Tuesday, March 26, 2013 09:42:06 AM Toshi Kani wrote:
> > > On Tue, 2013-03-26 at 14:04 +0100, Rafael J. Wysocki wrote:
> > > > On Monday, February 25, 2013 02:02:10 PM Toshi Kani wrote:
> > > > > In order to eject a memory device object represented as "PNP0C80:%d"
> > > > > in sysfs, its associated memblocks (system/memory/memory%d) need to
> > > > > be off-lined.  However, there is no user friendly way to correlate
> > > > > between a memory device object and its memblocks in sysfs.
> > > > > 
> > > > > This patch creates sysfs links to memblocks under a memory device
> > > > > object so that a user can easily checks and manipulates its memblocks
> > > > > in sysfs.
> > > > > 
> > > > > For example, when PNP0C80:05 is associated with memory8 and memory9,
> > > > > the following two links are created under PNP0C80:05.  This allows
> > > > > a user to access memory8/9 directly from PNP0C80:05.
> > > > > 
> > > > >   # ll /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0C80:05
> > > > >   lrwxrwxrwx. memory8 -> ../../../system/memory/memory8
> > > > >   lrwxrwxrwx. memory9 -> ../../../system/memory/memory9
> > > > > 
> > > > > Signed-off-by: Toshi Kani <toshi.kani@hp.com>
> > > > 
> > > > Here I have some doubts.
> > > > 
> > > > This adds a very specific interface for user space that we're going to need to
> > > > maintain going forward if the user space starts to use it.  However, it kind of
> > > > duplicates the existing "physical_node" interface that we have for "regular"
> > > > devices.
> > > > 
> > > > So, if possible, I'd like the memory subsystem to utilize the existing
> > > > interface instead of creating an entirely new one.  Namely, why don't we create
> > > > a struct device-based object for each memory block and associated those new
> > > > "devices" with the PNP0C80 ACPI object through the functions in glue.c?
> > > > Then, we could add an "offline/online" interface to those "devices" too.
> > > 
> > > This patch simply adds symbolic links to system/memory/memoryN, which
> > > the memory subsystem already provides for the online/offline interface
> > > of memory blocks.  So, it does not introduce a new interface, but guides
> > > users (and user tools) to know which memory blocks need to be off-lined
> > > in order to hot-delete any particular memory device PNP0C80:X.  A cpu
> > > device LNXCPU:X also has a similar symbolic link "sysdev" that links to
> > > system/cpu/cpuN.  I could not use the same "sysdev" for PNP0C80:X since
> > > it typically associates with multiple memory blocks.
> > > 
> > > I thought about using glue.c to create symbolic links between memoryN
> > > and PNP0C80:X.  However, it has an ordering issue.  During boot-time,
> > > memoryN gets created before PNP0C80:X.  But during hot-add, PNP0C80:X
> > > gets created before memoryN.
> > 
> > Quite frankly, this sounds like a bug to me.  Namely, what is memoryN really
> > good for without PNP0C80:X?  If it is not good for anything in that case,
> > it should never be created befor PNP0C80:X.
> 
> memoryN works without PNP0C80:X and does not depend on ACPI.  A memoryN
> represents a memblk, each of which is merely a 128MB (in case of x86) of
> memory chunk sliced from the entire memory ranges.

Ah.  Why is it exported this way?

> This is why it is
> hard to associate between memoryN and PNP0C80:X without these symbolic
> links (otherwise, you will have to calculate from memory address.)

Sure.

> The memory subsystem also obtains the memory ranges from EFI or e820
> during boot, and ACPI is not necessary to construct memoryN at boot.
> Since EFI / e820 only provides the boot-time configuration, ACPI is used
> to update the memory ranges during hot-add/delete.
> 
> > > This patch calls
> > > acpi_setup_mem_blk_links() in a point that solves this ordering issue
> > > since this point guarantees that both memoryN and PNP0C80X are created
> > > for both boot-time and hot-add.
> > 
> > I would prefer the ordering of creation to be the same in both cases.
> > Otherwise it really looks like we need to work around a problem that we're
> > creating for ourselves.
> > 
> > How exactly are memoryN created during boot?
> 
> memoryN is created in memory_dev_init().  This is even before
> do_initcalls().  I'd also prefer the same ordering, but I felt that
> changing this ordering would be rather challenging.
> 
> do_basic_setup()
>   driver_init()
>     memory_dev_init()

What about having a "physical memory module" device that will be associated
with those memory blocks (and will have links to them from its directory in
sysfs) and that will be pointed to by the "physical_node" pointer from
under PNP0C80:X?  Then, it may have a driver that will do the online/offline
(and export the interface for that to user space through sysfs) and will
interact with the ACPI interface provided by PNP0C80:X.

Thanks,
Rafael
Toshi Kani March 27, 2013, 5:13 p.m. UTC | #8
On Wed, 2013-03-27 at 00:39 +0100, Rafael J. Wysocki wrote:
> On Tuesday, March 26, 2013 04:59:36 PM Toshi Kani wrote:
> > On Tue, 2013-03-26 at 23:39 +0100, Rafael J. Wysocki wrote:
> > > On Tuesday, March 26, 2013 09:42:06 AM Toshi Kani wrote:
> > > > On Tue, 2013-03-26 at 14:04 +0100, Rafael J. Wysocki wrote:
> > > > > On Monday, February 25, 2013 02:02:10 PM Toshi Kani wrote:
> > > > > > In order to eject a memory device object represented as "PNP0C80:%d"
> > > > > > in sysfs, its associated memblocks (system/memory/memory%d) need to
> > > > > > be off-lined.  However, there is no user friendly way to correlate
> > > > > > between a memory device object and its memblocks in sysfs.
> > > > > > 
> > > > > > This patch creates sysfs links to memblocks under a memory device
> > > > > > object so that a user can easily checks and manipulates its memblocks
> > > > > > in sysfs.
> > > > > > 
> > > > > > For example, when PNP0C80:05 is associated with memory8 and memory9,
> > > > > > the following two links are created under PNP0C80:05.  This allows
> > > > > > a user to access memory8/9 directly from PNP0C80:05.
> > > > > > 
> > > > > >   # ll /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0C80:05
> > > > > >   lrwxrwxrwx. memory8 -> ../../../system/memory/memory8
> > > > > >   lrwxrwxrwx. memory9 -> ../../../system/memory/memory9
> > > > > > 
> > > > > > Signed-off-by: Toshi Kani <toshi.kani@hp.com>
> > > > > 
> > > > > Here I have some doubts.
> > > > > 
> > > > > This adds a very specific interface for user space that we're going to need to
> > > > > maintain going forward if the user space starts to use it.  However, it kind of
> > > > > duplicates the existing "physical_node" interface that we have for "regular"
> > > > > devices.
> > > > > 
> > > > > So, if possible, I'd like the memory subsystem to utilize the existing
> > > > > interface instead of creating an entirely new one.  Namely, why don't we create
> > > > > a struct device-based object for each memory block and associated those new
> > > > > "devices" with the PNP0C80 ACPI object through the functions in glue.c?
> > > > > Then, we could add an "offline/online" interface to those "devices" too.
> > > > 
> > > > This patch simply adds symbolic links to system/memory/memoryN, which
> > > > the memory subsystem already provides for the online/offline interface
> > > > of memory blocks.  So, it does not introduce a new interface, but guides
> > > > users (and user tools) to know which memory blocks need to be off-lined
> > > > in order to hot-delete any particular memory device PNP0C80:X.  A cpu
> > > > device LNXCPU:X also has a similar symbolic link "sysdev" that links to
> > > > system/cpu/cpuN.  I could not use the same "sysdev" for PNP0C80:X since
> > > > it typically associates with multiple memory blocks.
> > > > 
> > > > I thought about using glue.c to create symbolic links between memoryN
> > > > and PNP0C80:X.  However, it has an ordering issue.  During boot-time,
> > > > memoryN gets created before PNP0C80:X.  But during hot-add, PNP0C80:X
> > > > gets created before memoryN.
> > > 
> > > Quite frankly, this sounds like a bug to me.  Namely, what is memoryN really
> > > good for without PNP0C80:X?  If it is not good for anything in that case,
> > > it should never be created befor PNP0C80:X.
> > 
> > memoryN works without PNP0C80:X and does not depend on ACPI.  A memoryN
> > represents a memblk, each of which is merely a 128MB (in case of x86) of
> > memory chunk sliced from the entire memory ranges.
> 
> Ah.  Why is it exported this way?

This is because a memblk is the unit of the Sparse Memory design, which
allows discontinuous memory ranges (ex. NUMA) and online/offline
operations.  So, it makes sense for the memory subsystem to export this
interface, although I feel that 128MB is rather small these days.

> > This is why it is
> > hard to associate between memoryN and PNP0C80:X without these symbolic
> > links (otherwise, you will have to calculate from memory address.)
> 
> Sure.
> 
> > The memory subsystem also obtains the memory ranges from EFI or e820
> > during boot, and ACPI is not necessary to construct memoryN at boot.
> > Since EFI / e820 only provides the boot-time configuration, ACPI is used
> > to update the memory ranges during hot-add/delete.
> > 
> > > > This patch calls
> > > > acpi_setup_mem_blk_links() in a point that solves this ordering issue
> > > > since this point guarantees that both memoryN and PNP0C80X are created
> > > > for both boot-time and hot-add.
> > > 
> > > I would prefer the ordering of creation to be the same in both cases.
> > > Otherwise it really looks like we need to work around a problem that we're
> > > creating for ourselves.
> > > 
> > > How exactly are memoryN created during boot?
> > 
> > memoryN is created in memory_dev_init().  This is even before
> > do_initcalls().  I'd also prefer the same ordering, but I felt that
> > changing this ordering would be rather challenging.
> > 
> > do_basic_setup()
> >   driver_init()
> >     memory_dev_init()
> 
> What about having a "physical memory module" device that will be associated
> with those memory blocks (and will have links to them from its directory in
> sysfs) and that will be pointed to by the "physical_node" pointer from
> under PNP0C80:X?  Then, it may have a driver that will do the online/offline
> (and export the interface for that to user space through sysfs) and will
> interact with the ACPI interface provided by PNP0C80:X.

It sounds interesting idea, but I think it has several issues:

 - Maintenance: It is not clear to me who owns this physical device.
From the memory subsystem perspective, a memblk is an object that is
managed like a device internally.  So, I do not think the memory
subsystem needs this.  If ACPI owns this, it adds complication to us and
we need to keep up with the memory changes.    
 - Rollback: As offline/online proceeds on each memblk, the driver has
to support rollback when one of memblks failed to offline.  This seems
to against the direction of this approach (no rollback in the kernel).
 - Ordering issue: Creating symbolic links among 3 types of devices
(PNP0C80, memblk, new physical device) further complicates the ordering
issue. 
 - Representation: PNP0C80:X represents the state of an ACPI PNP0C80
device.  memoryN represents the state of a memblk.  It is not clear what
this new physical device really represents when there isn't such device
visible to users.  It may be seen as duplication.

Frankly, if we are to provide a good sysfs UI for hot-delete, I'd prefer
to go with my updated patchset, which keeps user to allow doing "echo 1
> PNP0C80:X/eject", and all associated memblks will be offlined
together.  I think the user space approach will require good management
tools to take care of the UI issue, and the kernel just has to provide
the necessary info to the user space, which this patch is intended.

Thanks,
-Toshi

--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael Wysocki April 11, 2013, 10:23 p.m. UTC | #9
Sorry for the delayed response.

On Wednesday, March 27, 2013 11:13:49 AM Toshi Kani wrote:
> On Wed, 2013-03-27 at 00:39 +0100, Rafael J. Wysocki wrote:
> > On Tuesday, March 26, 2013 04:59:36 PM Toshi Kani wrote:
> > > On Tue, 2013-03-26 at 23:39 +0100, Rafael J. Wysocki wrote:
> > > > On Tuesday, March 26, 2013 09:42:06 AM Toshi Kani wrote:
> > > > > On Tue, 2013-03-26 at 14:04 +0100, Rafael J. Wysocki wrote:
> > > > > > On Monday, February 25, 2013 02:02:10 PM Toshi Kani wrote:
> > > > > > > In order to eject a memory device object represented as "PNP0C80:%d"
> > > > > > > in sysfs, its associated memblocks (system/memory/memory%d) need to
> > > > > > > be off-lined.  However, there is no user friendly way to correlate
> > > > > > > between a memory device object and its memblocks in sysfs.
> > > > > > > 
> > > > > > > This patch creates sysfs links to memblocks under a memory device
> > > > > > > object so that a user can easily checks and manipulates its memblocks
> > > > > > > in sysfs.
> > > > > > > 
> > > > > > > For example, when PNP0C80:05 is associated with memory8 and memory9,
> > > > > > > the following two links are created under PNP0C80:05.  This allows
> > > > > > > a user to access memory8/9 directly from PNP0C80:05.
> > > > > > > 
> > > > > > >   # ll /sys/devices/LNXSYSTM:00/LNXSYBUS:00/PNP0C80:05
> > > > > > >   lrwxrwxrwx. memory8 -> ../../../system/memory/memory8
> > > > > > >   lrwxrwxrwx. memory9 -> ../../../system/memory/memory9
> > > > > > > 
> > > > > > > Signed-off-by: Toshi Kani <toshi.kani@hp.com>
> > > > > > 
> > > > > > Here I have some doubts.
> > > > > > 
> > > > > > This adds a very specific interface for user space that we're going to need to
> > > > > > maintain going forward if the user space starts to use it.  However, it kind of
> > > > > > duplicates the existing "physical_node" interface that we have for "regular"
> > > > > > devices.
> > > > > > 
> > > > > > So, if possible, I'd like the memory subsystem to utilize the existing
> > > > > > interface instead of creating an entirely new one.  Namely, why don't we create
> > > > > > a struct device-based object for each memory block and associated those new
> > > > > > "devices" with the PNP0C80 ACPI object through the functions in glue.c?
> > > > > > Then, we could add an "offline/online" interface to those "devices" too.
> > > > > 
> > > > > This patch simply adds symbolic links to system/memory/memoryN, which
> > > > > the memory subsystem already provides for the online/offline interface
> > > > > of memory blocks.  So, it does not introduce a new interface, but guides
> > > > > users (and user tools) to know which memory blocks need to be off-lined
> > > > > in order to hot-delete any particular memory device PNP0C80:X.  A cpu
> > > > > device LNXCPU:X also has a similar symbolic link "sysdev" that links to
> > > > > system/cpu/cpuN.  I could not use the same "sysdev" for PNP0C80:X since
> > > > > it typically associates with multiple memory blocks.
> > > > > 
> > > > > I thought about using glue.c to create symbolic links between memoryN
> > > > > and PNP0C80:X.  However, it has an ordering issue.  During boot-time,
> > > > > memoryN gets created before PNP0C80:X.  But during hot-add, PNP0C80:X
> > > > > gets created before memoryN.
> > > > 
> > > > Quite frankly, this sounds like a bug to me.  Namely, what is memoryN really
> > > > good for without PNP0C80:X?  If it is not good for anything in that case,
> > > > it should never be created befor PNP0C80:X.
> > > 
> > > memoryN works without PNP0C80:X and does not depend on ACPI.  A memoryN
> > > represents a memblk, each of which is merely a 128MB (in case of x86) of
> > > memory chunk sliced from the entire memory ranges.
> > 
> > Ah.  Why is it exported this way?
> 
> This is because a memblk is the unit of the Sparse Memory design, which
> allows discontinuous memory ranges (ex. NUMA) and online/offline
> operations.  So, it makes sense for the memory subsystem to export this
> interface, although I feel that 128MB is rather small these days.
> 
> > > This is why it is
> > > hard to associate between memoryN and PNP0C80:X without these symbolic
> > > links (otherwise, you will have to calculate from memory address.)
> > 
> > Sure.
> > 
> > > The memory subsystem also obtains the memory ranges from EFI or e820
> > > during boot, and ACPI is not necessary to construct memoryN at boot.
> > > Since EFI / e820 only provides the boot-time configuration, ACPI is used
> > > to update the memory ranges during hot-add/delete.
> > > 
> > > > > This patch calls
> > > > > acpi_setup_mem_blk_links() in a point that solves this ordering issue
> > > > > since this point guarantees that both memoryN and PNP0C80X are created
> > > > > for both boot-time and hot-add.
> > > > 
> > > > I would prefer the ordering of creation to be the same in both cases.
> > > > Otherwise it really looks like we need to work around a problem that we're
> > > > creating for ourselves.
> > > > 
> > > > How exactly are memoryN created during boot?
> > > 
> > > memoryN is created in memory_dev_init().  This is even before
> > > do_initcalls().  I'd also prefer the same ordering, but I felt that
> > > changing this ordering would be rather challenging.
> > > 
> > > do_basic_setup()
> > >   driver_init()
> > >     memory_dev_init()
> > 
> > What about having a "physical memory module" device that will be associated
> > with those memory blocks (and will have links to them from its directory in
> > sysfs) and that will be pointed to by the "physical_node" pointer from
> > under PNP0C80:X?  Then, it may have a driver that will do the online/offline
> > (and export the interface for that to user space through sysfs) and will
> > interact with the ACPI interface provided by PNP0C80:X.
> 
> It sounds interesting idea, but I think it has several issues:
> 
>  - Maintenance: It is not clear to me who owns this physical device.
> From the memory subsystem perspective, a memblk is an object that is
> managed like a device internally.  So, I do not think the memory
> subsystem needs this.

It doesn't, if "offline" is implemented at the memblk level.  Which I'm not
sure is practical, because multiple memblks will be offlined/onlined
simultaneously in all realistic setups.

> If ACPI owns this, it adds complication to us and
> we need to keep up with the memory changes.

Well, "owning" doesn't really apply to any kernel code in any meaningful
way.  I'm familiar with that corporate jargon in which "to own" means
"to be responsible for" roughly, but even that is not applicable to kernel
code, which is not owned by anyone in any form.

The maintenance of kernel code is more like taking care of it and there are
no hard rules about who should or should not be maintaining this or that
piece of it.

I could maintain that driver just fine, so that's not really an issue.

>  - Rollback: As offline/online proceeds on each memblk, the driver has
> to support rollback when one of memblks failed to offline.  This seems
> to against the direction of this approach (no rollback in the kernel).

Depending on what you mean by "rollback".  Technically, if one memblk
fails to offline, there's no need to online the other memblks that have
been offlined already in the same group.  They very well can stay online.

Yes, it would be convenient to online them back to have a well-defined state
after a failing attempt to offline stuff, but that's not a correctness issue.

>  - Ordering issue: Creating symbolic links among 3 types of devices
> (PNP0C80, memblk, new physical device) further complicates the ordering
> issue. 

That's a valid one.  But do we need to link the new device to memblks in
sysfs?

>  - Representation: PNP0C80:X represents the state of an ACPI PNP0C80
> device.  memoryN represents the state of a memblk.  It is not clear what
> this new physical device really represents when there isn't such device
> visible to users.  It may be seen as duplication.

Yes, it may.  That said, I'm not exactly sure why memblks are regarded as
a good way to represent hot-removable memory.  It looks like they are
arbitrary stuff without any relationship to real hardware.  And that's where
the source of the problem is, IMHO.

In any case, ACPI PNP0C80:X should not be regarded as a *device*.  It just
is an interface to a platform mechanism allowing us to do something to a real
device (e.g. eject it), but it is not that device in general.

> Frankly, if we are to provide a good sysfs UI for hot-delete, I'd prefer
> to go with my updated patchset, which keeps user to allow doing "echo 1
> > PNP0C80:X/eject", and all associated memblks will be offlined
> together.

Which isn't correct.  Memory offline shouldn't be associated with an ACPI
device object, because that operation doesn't belong there (eject should
live there, not offline).

> I think the user space approach will require good management
> tools to take care of the UI issue, and the kernel just has to provide
> the necessary info to the user space, which this patch is intended.

No doubt about that, but the question is how to provide that information.

Thanks,
Rafael
Toshi Kani April 12, 2013, 6:13 p.m. UTC | #10
On Fri, 2013-04-12 at 00:23 +0200, Rafael J. Wysocki wrote:
> Sorry for the delayed response.

No problem.  I know you are busy.  Thanks for the reply.

> On Wednesday, March 27, 2013 11:13:49 AM Toshi Kani wrote:
> > On Wed, 2013-03-27 at 00:39 +0100, Rafael J. Wysocki wrote:
 :
> > > 
> > > What about having a "physical memory module" device that will be associated
> > > with those memory blocks (and will have links to them from its directory in
> > > sysfs) and that will be pointed to by the "physical_node" pointer from
> > > under PNP0C80:X?  Then, it may have a driver that will do the online/offline
> > > (and export the interface for that to user space through sysfs) and will
> > > interact with the ACPI interface provided by PNP0C80:X.
> > 
> > It sounds interesting idea, but I think it has several issues:
> > 
> >  - Maintenance: It is not clear to me who owns this physical device.
> > From the memory subsystem perspective, a memblk is an object that is
> > managed like a device internally.  So, I do not think the memory
> > subsystem needs this.
> 
> It doesn't, if "offline" is implemented at the memblk level.  

Agreed.

> Which I'm not
> sure is practical, because multiple memblks will be offlined/onlined
> simultaneously in all realistic setups.

Yes, the current memblk size of 128MB on x86 is rather small.  I agree
it is unlikely that regular users use this interface directly.  I think
this interface is intended for developers and tools. 

> > If ACPI owns this, it adds complication to us and
> > we need to keep up with the memory changes.
> 
> Well, "owning" doesn't really apply to any kernel code in any meaningful
> way.  I'm familiar with that corporate jargon in which "to own" means
> "to be responsible for" roughly, but even that is not applicable to kernel
> code, which is not owned by anyone in any form.

Right.  (I think I spent too long in a corporate world...)  

> The maintenance of kernel code is more like taking care of it and there are
> no hard rules about who should or should not be maintaining this or that
> piece of it.
> 
> I could maintain that driver just fine, so that's not really an issue.

Well, I mentioned it because I thought you had concerned about it in
your earlier reply below.  I agree this is a non-issue as you do not see
as an issue. :)
https://lkml.org/lkml/2013/3/26/198

> >  - Rollback: As offline/online proceeds on each memblk, the driver has
> > to support rollback when one of memblks failed to offline.  This seems
> > to against the direction of this approach (no rollback in the kernel).
> 
> Depending on what you mean by "rollback".  Technically, if one memblk
> fails to offline, there's no need to online the other memblks that have
> been offlined already in the same group.  They very well can stay online.
> 
> Yes, it would be convenient to online them back to have a well-defined state
> after a failing attempt to offline stuff, but that's not a correctness issue.

This new device represents a single device to users, but is associated
with a set of memblks internally.  If an end result is some memblks
online and some off-line, I wonder how we represent the state of this
device...

> >  - Ordering issue: Creating symbolic links among 3 types of devices
> > (PNP0C80, memblk, new physical device) further complicates the ordering
> > issue. 
> 
> That's a valid one.  But do we need to link the new device to memblks in
> sysfs?

It depends on how well this new device can abstract internal memblks.
For instance, if a new device may have a mixed state (some memblks
online and some offline), it may need to link to the memblks so that
each memblk state can be managed.  The memblk interface also has other
attribute, such as movable and non-movable.

> >  - Representation: PNP0C80:X represents the state of an ACPI PNP0C80
> > device.  memoryN represents the state of a memblk.  It is not clear what
> > this new physical device really represents when there isn't such device
> > visible to users.  It may be seen as duplication.
> 
> Yes, it may.  That said, I'm not exactly sure why memblks are regarded as
> a good way to represent hot-removable memory.  It looks like they are
> arbitrary stuff without any relationship to real hardware.  And that's where
> the source of the problem is, IMHO.

I think it is designed to keep it platform-neutral.  I do not think it
is regarded as a good way to represent hot-removable memory for regular
users.

> In any case, ACPI PNP0C80:X should not be regarded as a *device*.  It just
> is an interface to a platform mechanism allowing us to do something to a real
> device (e.g. eject it), but it is not that device in general.

I agree.  I do not think PNP0C80:X is good for regular users, either.

> > Frankly, if we are to provide a good sysfs UI for hot-delete, I'd prefer
> > to go with my updated patchset, which keeps user to allow doing "echo 1
> > > PNP0C80:X/eject", and all associated memblks will be offlined
> > together.
> 
> Which isn't correct.  Memory offline shouldn't be associated with an ACPI
> device object, because that operation doesn't belong there (eject should
> live there, not offline).

I actually meant to say ACPI eject notification, not the sysfs eject of
PNP0C80X, as their requests are handled in the same way.

> > I think the user space approach will require good management
> > tools to take care of the UI issue, and the kernel just has to provide
> > the necessary info to the user space, which this patch is intended.
> 
> No doubt about that, but the question is how to provide that information.

There are basically 3 ways for users to manage memory hotplug
operations.  Let's look at them to see what we need.  In comparison, (U)
refers to the user space approach (user has to offline first), and (K)
refers to the kernel approach (the kernel does both offline and eject).

1. Management Console (ACPI notifications)
Platforms that support ACPI memory hotplug will support ACPI
notifications to initiate hot-add and hot-delete operations from the
management console (or doorbells, host CLIs for VMs).  It has the
platform knowledge, and does not depend on the kernel info to operate
(other than _OST).

(U) For hot-delete, users need to offline target memory beforehand by 2
or 3.  The management console is a single point of administration.

(K) Users can operate both hot-add and delete.  The kernel attempts to
offline upon the request.

2. Management Tool
Management tool can provide CLI/GUI to operate memory hot-add and
hot-delete.  The tool may interface to the management console and the
kernel.  The tool needs to have the hardware info in order to guide
users to a right target, which may be obtained from the management
console, predefined DB and sysfs.

(U) Management tool refers to ACPI PNP0C80:X devices to locate a right
target since they have _UID, _SUN and _STR, which identify their
hardware locations.  The tool also needs offline the memory associated
with a target PNP0C80;X, which can be done either by a new associated
device object or symbolic link to memblks in this patch.  In case of
memblks, the tool can perform rollback as necessary.  The tool then
ejects the target with sysfs eject of PNP0C80:X.

(K) Management tool sends a hotplug request through the management
console.  It may also interface with the kernel as necessary.

3. Sysfs
The sysfs interfaces can only support hot-delete & online/offline.
Users need to use 1 or 2 for hot-add.  It does not provide a single
point of administration.

(U) If designed properly, new device objects may help regular users to
find a right target to offline via sysfs.  Users can then eject it with
sysfs eject of PNP0C80 via a symbolic link.

(K) Users can request hot-delete with sysfs eject of PNP0C80:X.
However, it is unlikely that regular users can find a right target
without knowing what ACPI is.  

Thanks,
-Toshi

--
To unsubscribe from this list: send the line "unsubscribe linux-acpi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 3b3abbc..98477a5 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -28,6 +28,7 @@ 
  */
 
 #include <linux/acpi.h>
+#include <linux/memory.h>
 #include <linux/memory_hotplug.h>
 
 #include "internal.h"
@@ -168,6 +169,55 @@  static int acpi_memory_check_device(struct acpi_memory_device *mem_device)
 	return 0;
 }
 
+static void acpi_setup_mem_blk_links(struct acpi_memory_device *mem_device,
+		struct acpi_memory_info *info, bool add_links)
+{
+	struct memory_block *mem_blk = NULL;
+	struct mem_section *mem_sect;
+	unsigned long start_pfn, end_pfn, pfn;
+	unsigned long section_nr;
+	int ret;
+
+	start_pfn = PFN_DOWN(info->start_addr);
+	end_pfn = PFN_UP(info->start_addr + info->length-1);
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
+		section_nr = pfn_to_section_nr(pfn);
+
+		if (!present_section_nr(section_nr))
+			continue;
+
+		mem_sect = __nr_to_section(section_nr);
+
+		/* skip if the same memblock */
+		if (mem_blk)
+			if ((section_nr >= mem_blk->start_section_nr) &&
+			    (section_nr <= mem_blk->end_section_nr))
+				continue;
+
+		mem_blk = find_memory_block_hinted(mem_sect, mem_blk);
+		if (!mem_blk)
+			continue;
+
+		if (add_links) {
+			ret = sysfs_create_link_nowarn(
+				&mem_device->device->dev.kobj,
+				&mem_blk->dev.kobj,
+				kobject_name(&mem_blk->dev.kobj));
+			if (ret && ret != -EEXIST)
+				dev_err(&mem_device->device->dev,
+					"Failed to create sysfs link %s\n",
+					kobject_name(&mem_blk->dev.kobj));
+		} else {
+			sysfs_remove_link(&mem_device->device->dev.kobj,
+				kobject_name(&mem_blk->dev.kobj));
+		}
+	}
+
+	if (mem_blk)
+		kobject_put(&mem_blk->dev.kobj);
+}
+
 static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 {
 	int result, num_enabled = 0;
@@ -207,6 +257,9 @@  static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
 			continue;
 		}
 
+		/* Create sysfs links to its mem_blk devices */
+		acpi_setup_mem_blk_links(mem_device, info, true);
+
 		if (!result)
 			info->enabled = 1;
 		/*
@@ -241,6 +294,9 @@  static int acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
 			/* The kernel does not use this memory block */
 			continue;
 
+		/* Remove sysfs links to its mem_blk devices */
+		acpi_setup_mem_blk_links(mem_device, info, false);
+
 		if (!info->enabled)
 			/*
 			 * The kernel uses this memory block, but it may be not