diff mbox series

[v2,4/8] mm/memory_hotplug: Create memory block devices after arch_add_memory()

Message ID 20190507183804.5512-5-david@redhat.com (mailing list archive)
State New, archived
Headers show
Series [v2,1/8] mm/memory_hotplug: Simplify and fix check_hotplug_memory_range() | expand

Commit Message

David Hildenbrand May 7, 2019, 6:38 p.m. UTC
Only memory to be added to the buddy and to be onlined/offlined by
user space using memory block devices needs (and should have!) memory
block devices.

Factor out creation of memory block devices Create all devices after
arch_add_memory() succeeded. We can later drop the want_memblock parameter,
because it is now effectively stale.

Only after memory block devices have been added, memory can be onlined
by user space. This implies, that memory is not visible to user space at
all before arch_add_memory() succeeded.

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rafael@kernel.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: "mike.travis@hpe.com" <mike.travis@hpe.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Andrew Banman <andrew.banman@hpe.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Qian Cai <cai@lca.pw>
Cc: Wei Yang <richard.weiyang@gmail.com>
Cc: Arun KS <arunks@codeaurora.org>
Cc: Mathieu Malaterre <malat@debian.org>
Signed-off-by: David Hildenbrand <david@redhat.com>
---
 drivers/base/memory.c  | 70 ++++++++++++++++++++++++++----------------
 include/linux/memory.h |  2 +-
 mm/memory_hotplug.c    | 15 ++++-----
 3 files changed, 53 insertions(+), 34 deletions(-)

Comments

Dan Williams May 7, 2019, 9:17 p.m. UTC | #1
On Tue, May 7, 2019 at 11:38 AM David Hildenbrand <david@redhat.com> wrote:
>
> Only memory to be added to the buddy and to be onlined/offlined by
> user space using memory block devices needs (and should have!) memory
> block devices.
>
> Factor out creation of memory block devices Create all devices after
> arch_add_memory() succeeded. We can later drop the want_memblock parameter,
> because it is now effectively stale.
>
> Only after memory block devices have been added, memory can be onlined
> by user space. This implies, that memory is not visible to user space at
> all before arch_add_memory() succeeded.

Nice!

>
> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> Cc: "Rafael J. Wysocki" <rafael@kernel.org>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: "mike.travis@hpe.com" <mike.travis@hpe.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: Andrew Banman <andrew.banman@hpe.com>
> Cc: Oscar Salvador <osalvador@suse.de>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
> Cc: Qian Cai <cai@lca.pw>
> Cc: Wei Yang <richard.weiyang@gmail.com>
> Cc: Arun KS <arunks@codeaurora.org>
> Cc: Mathieu Malaterre <malat@debian.org>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  drivers/base/memory.c  | 70 ++++++++++++++++++++++++++----------------
>  include/linux/memory.h |  2 +-
>  mm/memory_hotplug.c    | 15 ++++-----
>  3 files changed, 53 insertions(+), 34 deletions(-)
>
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index 6e0cb4fda179..862c202a18ca 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -701,44 +701,62 @@ static int add_memory_block(int base_section_nr)
>         return 0;
>  }
>
> +static void unregister_memory(struct memory_block *memory)
> +{
> +       BUG_ON(memory->dev.bus != &memory_subsys);

Given this should never happen and only a future kernel developer
might trip over it, do we really need to kill that developer's
machine? I.e. s/BUG/WARN/? I guess an argument can be made to move
such a change that to a follow-on patch since you're just preserving
existing behavior, but I figure might as well address these as the
code is refactored.

> +
> +       /* drop the ref. we got via find_memory_block() */
> +       put_device(&memory->dev);
> +       device_unregister(&memory->dev);
> +}
> +
>  /*
> - * need an interface for the VM to add new memory regions,
> - * but without onlining it.
> + * Create memory block devices for the given memory area. Start and size
> + * have to be aligned to memory block granularity. Memory block devices
> + * will be initialized as offline.
>   */
> -int hotplug_memory_register(int nid, struct mem_section *section)
> +int hotplug_memory_register(unsigned long start, unsigned long size)
>  {
> -       int ret = 0;
> +       unsigned long block_nr_pages = memory_block_size_bytes() >> PAGE_SHIFT;
> +       unsigned long start_pfn = PFN_DOWN(start);
> +       unsigned long end_pfn = start_pfn + (size >> PAGE_SHIFT);
> +       unsigned long pfn;
>         struct memory_block *mem;
> +       int ret = 0;
>
> -       mutex_lock(&mem_sysfs_mutex);
> +       BUG_ON(!IS_ALIGNED(start, memory_block_size_bytes()));
> +       BUG_ON(!IS_ALIGNED(size, memory_block_size_bytes()));

Perhaps:

    if (WARN_ON(...))
        return -EINVAL;

>
> -       mem = find_memory_block(section);
> -       if (mem) {
> -               mem->section_count++;
> -               put_device(&mem->dev);
> -       } else {
> -               ret = init_memory_block(&mem, section, MEM_OFFLINE);
> +       mutex_lock(&mem_sysfs_mutex);
> +       for (pfn = start_pfn; pfn != end_pfn; pfn += block_nr_pages) {
> +               mem = find_memory_block(__pfn_to_section(pfn));
> +               if (mem) {
> +                       WARN_ON_ONCE(false);

?? Isn't that a nop?

> +                       put_device(&mem->dev);
> +                       continue;
> +               }
> +               ret = init_memory_block(&mem, __pfn_to_section(pfn),
> +                                       MEM_OFFLINE);
>                 if (ret)
> -                       goto out;
> -               mem->section_count++;
> +                       break;
> +               mem->section_count = memory_block_size_bytes() /
> +                                    MIN_MEMORY_BLOCK_SIZE;
> +       }
> +       if (ret) {
> +               end_pfn = pfn;
> +               for (pfn = start_pfn; pfn != end_pfn; pfn += block_nr_pages) {
> +                       mem = find_memory_block(__pfn_to_section(pfn));
> +                       if (!mem)
> +                               continue;
> +                       mem->section_count = 0;
> +                       unregister_memory(mem);
> +               }
>         }
> -
> -out:
>         mutex_unlock(&mem_sysfs_mutex);
>         return ret;
>  }
>
> -static void
> -unregister_memory(struct memory_block *memory)
> -{
> -       BUG_ON(memory->dev.bus != &memory_subsys);
> -
> -       /* drop the ref. we got via find_memory_block() */
> -       put_device(&memory->dev);
> -       device_unregister(&memory->dev);
> -}
> -
> -void unregister_memory_section(struct mem_section *section)
> +static int remove_memory_section(struct mem_section *section)
>  {
>         struct memory_block *mem;
>
> diff --git a/include/linux/memory.h b/include/linux/memory.h
> index 474c7c60c8f2..95505fbb5f85 100644
> --- a/include/linux/memory.h
> +++ b/include/linux/memory.h
> @@ -111,7 +111,7 @@ extern int register_memory_notifier(struct notifier_block *nb);
>  extern void unregister_memory_notifier(struct notifier_block *nb);
>  extern int register_memory_isolate_notifier(struct notifier_block *nb);
>  extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
> -int hotplug_memory_register(int nid, struct mem_section *section);
> +int hotplug_memory_register(unsigned long start, unsigned long size);
>  extern void unregister_memory_section(struct mem_section *);
>  extern int memory_dev_init(void);
>  extern int memory_notify(unsigned long val, void *v);
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 7b5439839d67..e1637c8a0723 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -258,13 +258,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
>                 return -EEXIST;
>
>         ret = sparse_add_one_section(nid, phys_start_pfn, altmap);
> -       if (ret < 0)
> -               return ret;
> -
> -       if (!want_memblock)
> -               return 0;
> -
> -       return hotplug_memory_register(nid, __pfn_to_section(phys_start_pfn));
> +       return ret < 0 ? ret : 0;
>  }
>
>  /*
> @@ -1106,6 +1100,13 @@ int __ref add_memory_resource(int nid, struct resource *res)
>         if (ret < 0)
>                 goto error;
>
> +       /* create memory block devices after memory was added */
> +       ret = hotplug_memory_register(start, size);
> +       if (ret) {
> +               arch_remove_memory(nid, start, size, NULL);
> +               goto error;
> +       }
> +
>         if (new_node) {
>                 /* If sysfs file of new node can't be created, cpu on the node
>                  * can't be hot-added. There is no rollback way now.
> --
> 2.20.1
>
David Hildenbrand May 7, 2019, 9:27 p.m. UTC | #2
>> +static void unregister_memory(struct memory_block *memory)
>> +{
>> +       BUG_ON(memory->dev.bus != &memory_subsys);
> 
> Given this should never happen and only a future kernel developer
> might trip over it, do we really need to kill that developer's
> machine? I.e. s/BUG/WARN/? I guess an argument can be made to move
> such a change that to a follow-on patch since you're just preserving
> existing behavior, but I figure might as well address these as the
> code is refactored.

I assume only

if (WARN ...)
	return;

makes sense then, right?

> 
>> +
>> +       /* drop the ref. we got via find_memory_block() */
>> +       put_device(&memory->dev);
>> +       device_unregister(&memory->dev);
>> +}
>> +
>>  /*
>> - * need an interface for the VM to add new memory regions,
>> - * but without onlining it.
>> + * Create memory block devices for the given memory area. Start and size
>> + * have to be aligned to memory block granularity. Memory block devices
>> + * will be initialized as offline.
>>   */
>> -int hotplug_memory_register(int nid, struct mem_section *section)
>> +int hotplug_memory_register(unsigned long start, unsigned long size)
>>  {
>> -       int ret = 0;
>> +       unsigned long block_nr_pages = memory_block_size_bytes() >> PAGE_SHIFT;
>> +       unsigned long start_pfn = PFN_DOWN(start);
>> +       unsigned long end_pfn = start_pfn + (size >> PAGE_SHIFT);
>> +       unsigned long pfn;
>>         struct memory_block *mem;
>> +       int ret = 0;
>>
>> -       mutex_lock(&mem_sysfs_mutex);
>> +       BUG_ON(!IS_ALIGNED(start, memory_block_size_bytes()));
>> +       BUG_ON(!IS_ALIGNED(size, memory_block_size_bytes()));
> 
> Perhaps:
> 
>     if (WARN_ON(...))
>         return -EINVAL;
> 

Yes, guess this souldn't hurt.

>>
>> -       mem = find_memory_block(section);
>> -       if (mem) {
>> -               mem->section_count++;
>> -               put_device(&mem->dev);
>> -       } else {
>> -               ret = init_memory_block(&mem, section, MEM_OFFLINE);
>> +       mutex_lock(&mem_sysfs_mutex);
>> +       for (pfn = start_pfn; pfn != end_pfn; pfn += block_nr_pages) {
>> +               mem = find_memory_block(__pfn_to_section(pfn));
>> +               if (mem) {
>> +                       WARN_ON_ONCE(false);
> 
> ?? Isn't that a nop?

Yes, that makes no sense :)

Thanks!
David Hildenbrand May 8, 2019, 8:35 a.m. UTC | #3
On 07.05.19 20:38, David Hildenbrand wrote:
> Only memory to be added to the buddy and to be onlined/offlined by
> user space using memory block devices needs (and should have!) memory
> block devices.
> 
> Factor out creation of memory block devices Create all devices after
> arch_add_memory() succeeded. We can later drop the want_memblock parameter,
> because it is now effectively stale.
> 
> Only after memory block devices have been added, memory can be onlined
> by user space. This implies, that memory is not visible to user space at
> all before arch_add_memory() succeeded.
> 
> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> Cc: "Rafael J. Wysocki" <rafael@kernel.org>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: "mike.travis@hpe.com" <mike.travis@hpe.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Ingo Molnar <mingo@kernel.org>
> Cc: Andrew Banman <andrew.banman@hpe.com>
> Cc: Oscar Salvador <osalvador@suse.de>
> Cc: Michal Hocko <mhocko@suse.com>
> Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
> Cc: Qian Cai <cai@lca.pw>
> Cc: Wei Yang <richard.weiyang@gmail.com>
> Cc: Arun KS <arunks@codeaurora.org>
> Cc: Mathieu Malaterre <malat@debian.org>
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  drivers/base/memory.c  | 70 ++++++++++++++++++++++++++----------------
>  include/linux/memory.h |  2 +-
>  mm/memory_hotplug.c    | 15 ++++-----
>  3 files changed, 53 insertions(+), 34 deletions(-)
> 
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index 6e0cb4fda179..862c202a18ca 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -701,44 +701,62 @@ static int add_memory_block(int base_section_nr)
>  	return 0;
>  }
>  
> +static void unregister_memory(struct memory_block *memory)
> +{
> +	BUG_ON(memory->dev.bus != &memory_subsys);
> +
> +	/* drop the ref. we got via find_memory_block() */
> +	put_device(&memory->dev);
> +	device_unregister(&memory->dev);
> +}
> +
>  /*
> - * need an interface for the VM to add new memory regions,
> - * but without onlining it.
> + * Create memory block devices for the given memory area. Start and size
> + * have to be aligned to memory block granularity. Memory block devices
> + * will be initialized as offline.
>   */
> -int hotplug_memory_register(int nid, struct mem_section *section)
> +int hotplug_memory_register(unsigned long start, unsigned long size)
>  {
> -	int ret = 0;
> +	unsigned long block_nr_pages = memory_block_size_bytes() >> PAGE_SHIFT;
> +	unsigned long start_pfn = PFN_DOWN(start);
> +	unsigned long end_pfn = start_pfn + (size >> PAGE_SHIFT);
> +	unsigned long pfn;
>  	struct memory_block *mem;
> +	int ret = 0;
>  
> -	mutex_lock(&mem_sysfs_mutex);
> +	BUG_ON(!IS_ALIGNED(start, memory_block_size_bytes()));
> +	BUG_ON(!IS_ALIGNED(size, memory_block_size_bytes()));
>  
> -	mem = find_memory_block(section);
> -	if (mem) {
> -		mem->section_count++;
> -		put_device(&mem->dev);
> -	} else {
> -		ret = init_memory_block(&mem, section, MEM_OFFLINE);
> +	mutex_lock(&mem_sysfs_mutex);
> +	for (pfn = start_pfn; pfn != end_pfn; pfn += block_nr_pages) {
> +		mem = find_memory_block(__pfn_to_section(pfn));
> +		if (mem) {
> +			WARN_ON_ONCE(false);
> +			put_device(&mem->dev);
> +			continue;
> +		}
> +		ret = init_memory_block(&mem, __pfn_to_section(pfn),
> +					MEM_OFFLINE);
>  		if (ret)
> -			goto out;
> -		mem->section_count++;
> +			break;
> +		mem->section_count = memory_block_size_bytes() /
> +				     MIN_MEMORY_BLOCK_SIZE;
> +	}
> +	if (ret) {
> +		end_pfn = pfn;
> +		for (pfn = start_pfn; pfn != end_pfn; pfn += block_nr_pages) {
> +			mem = find_memory_block(__pfn_to_section(pfn));
> +			if (!mem)
> +				continue;
> +			mem->section_count = 0;
> +			unregister_memory(mem);
> +		}
>  	}
> -
> -out:
>  	mutex_unlock(&mem_sysfs_mutex);
>  	return ret;
>  }
>  
> -static void
> -unregister_memory(struct memory_block *memory)
> -{
> -	BUG_ON(memory->dev.bus != &memory_subsys);
> -
> -	/* drop the ref. we got via find_memory_block() */
> -	put_device(&memory->dev);
> -	device_unregister(&memory->dev);
> -}
> -
> -void unregister_memory_section(struct mem_section *section)
> +static int remove_memory_section(struct mem_section *section)
>  {

The function change is misplaces in this patch will drop it so this
patch compiles without the other patches.
Wei Yang May 9, 2019, 12:43 p.m. UTC | #4
On Tue, May 07, 2019 at 08:38:00PM +0200, David Hildenbrand wrote:
>Only memory to be added to the buddy and to be onlined/offlined by
>user space using memory block devices needs (and should have!) memory
>block devices.
>
>Factor out creation of memory block devices Create all devices after
>arch_add_memory() succeeded. We can later drop the want_memblock parameter,
>because it is now effectively stale.
>
>Only after memory block devices have been added, memory can be onlined
>by user space. This implies, that memory is not visible to user space at
>all before arch_add_memory() succeeded.
>
>Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
>Cc: "Rafael J. Wysocki" <rafael@kernel.org>
>Cc: David Hildenbrand <david@redhat.com>
>Cc: "mike.travis@hpe.com" <mike.travis@hpe.com>
>Cc: Andrew Morton <akpm@linux-foundation.org>
>Cc: Ingo Molnar <mingo@kernel.org>
>Cc: Andrew Banman <andrew.banman@hpe.com>
>Cc: Oscar Salvador <osalvador@suse.de>
>Cc: Michal Hocko <mhocko@suse.com>
>Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
>Cc: Qian Cai <cai@lca.pw>
>Cc: Wei Yang <richard.weiyang@gmail.com>
>Cc: Arun KS <arunks@codeaurora.org>
>Cc: Mathieu Malaterre <malat@debian.org>
>Signed-off-by: David Hildenbrand <david@redhat.com>
>---
> drivers/base/memory.c  | 70 ++++++++++++++++++++++++++----------------
> include/linux/memory.h |  2 +-
> mm/memory_hotplug.c    | 15 ++++-----
> 3 files changed, 53 insertions(+), 34 deletions(-)
>
>diff --git a/drivers/base/memory.c b/drivers/base/memory.c
>index 6e0cb4fda179..862c202a18ca 100644
>--- a/drivers/base/memory.c
>+++ b/drivers/base/memory.c
>@@ -701,44 +701,62 @@ static int add_memory_block(int base_section_nr)
> 	return 0;
> }
> 
>+static void unregister_memory(struct memory_block *memory)
>+{
>+	BUG_ON(memory->dev.bus != &memory_subsys);
>+
>+	/* drop the ref. we got via find_memory_block() */
>+	put_device(&memory->dev);
>+	device_unregister(&memory->dev);
>+}
>+
> /*
>- * need an interface for the VM to add new memory regions,
>- * but without onlining it.
>+ * Create memory block devices for the given memory area. Start and size
>+ * have to be aligned to memory block granularity. Memory block devices
>+ * will be initialized as offline.
>  */
>-int hotplug_memory_register(int nid, struct mem_section *section)
>+int hotplug_memory_register(unsigned long start, unsigned long size)
> {
>-	int ret = 0;
>+	unsigned long block_nr_pages = memory_block_size_bytes() >> PAGE_SHIFT;
>+	unsigned long start_pfn = PFN_DOWN(start);
>+	unsigned long end_pfn = start_pfn + (size >> PAGE_SHIFT);
>+	unsigned long pfn;
> 	struct memory_block *mem;
>+	int ret = 0;
> 
>-	mutex_lock(&mem_sysfs_mutex);
>+	BUG_ON(!IS_ALIGNED(start, memory_block_size_bytes()));
>+	BUG_ON(!IS_ALIGNED(size, memory_block_size_bytes()));

After this change, the call flow looks like this:

add_memory_resource
    check_hotplug_memory_range
    hotplug_memory_register

Since in check_hotplug_memory_range() has checked the boundary, do we need to
check here again?
David Hildenbrand May 9, 2019, 12:50 p.m. UTC | #5
On 09.05.19 14:43, Wei Yang wrote:
> On Tue, May 07, 2019 at 08:38:00PM +0200, David Hildenbrand wrote:
>> Only memory to be added to the buddy and to be onlined/offlined by
>> user space using memory block devices needs (and should have!) memory
>> block devices.
>>
>> Factor out creation of memory block devices Create all devices after
>> arch_add_memory() succeeded. We can later drop the want_memblock parameter,
>> because it is now effectively stale.
>>
>> Only after memory block devices have been added, memory can be onlined
>> by user space. This implies, that memory is not visible to user space at
>> all before arch_add_memory() succeeded.
>>
>> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
>> Cc: "Rafael J. Wysocki" <rafael@kernel.org>
>> Cc: David Hildenbrand <david@redhat.com>
>> Cc: "mike.travis@hpe.com" <mike.travis@hpe.com>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Ingo Molnar <mingo@kernel.org>
>> Cc: Andrew Banman <andrew.banman@hpe.com>
>> Cc: Oscar Salvador <osalvador@suse.de>
>> Cc: Michal Hocko <mhocko@suse.com>
>> Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
>> Cc: Qian Cai <cai@lca.pw>
>> Cc: Wei Yang <richard.weiyang@gmail.com>
>> Cc: Arun KS <arunks@codeaurora.org>
>> Cc: Mathieu Malaterre <malat@debian.org>
>> Signed-off-by: David Hildenbrand <david@redhat.com>
>> ---
>> drivers/base/memory.c  | 70 ++++++++++++++++++++++++++----------------
>> include/linux/memory.h |  2 +-
>> mm/memory_hotplug.c    | 15 ++++-----
>> 3 files changed, 53 insertions(+), 34 deletions(-)
>>
>> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
>> index 6e0cb4fda179..862c202a18ca 100644
>> --- a/drivers/base/memory.c
>> +++ b/drivers/base/memory.c
>> @@ -701,44 +701,62 @@ static int add_memory_block(int base_section_nr)
>> 	return 0;
>> }
>>
>> +static void unregister_memory(struct memory_block *memory)
>> +{
>> +	BUG_ON(memory->dev.bus != &memory_subsys);
>> +
>> +	/* drop the ref. we got via find_memory_block() */
>> +	put_device(&memory->dev);
>> +	device_unregister(&memory->dev);
>> +}
>> +
>> /*
>> - * need an interface for the VM to add new memory regions,
>> - * but without onlining it.
>> + * Create memory block devices for the given memory area. Start and size
>> + * have to be aligned to memory block granularity. Memory block devices
>> + * will be initialized as offline.
>>  */
>> -int hotplug_memory_register(int nid, struct mem_section *section)
>> +int hotplug_memory_register(unsigned long start, unsigned long size)
>> {
>> -	int ret = 0;
>> +	unsigned long block_nr_pages = memory_block_size_bytes() >> PAGE_SHIFT;
>> +	unsigned long start_pfn = PFN_DOWN(start);
>> +	unsigned long end_pfn = start_pfn + (size >> PAGE_SHIFT);
>> +	unsigned long pfn;
>> 	struct memory_block *mem;
>> +	int ret = 0;
>>
>> -	mutex_lock(&mem_sysfs_mutex);
>> +	BUG_ON(!IS_ALIGNED(start, memory_block_size_bytes()));
>> +	BUG_ON(!IS_ALIGNED(size, memory_block_size_bytes()));
> 
> After this change, the call flow looks like this:
> 
> add_memory_resource
>     check_hotplug_memory_range
>     hotplug_memory_register
> 
> Since in check_hotplug_memory_range() has checked the boundary, do we need to
> check here again?
> 

I prefer to check for such requirements explicitly in applicable places,
especially if they are placed in different files. Makes code easier to
get. WARN_ON_ONCE will indicate that this has to be assured by the caller.

Thanks!
Wei Yang May 9, 2019, 1:55 p.m. UTC | #6
On Tue, May 07, 2019 at 08:38:00PM +0200, David Hildenbrand wrote:
>Only memory to be added to the buddy and to be onlined/offlined by
>user space using memory block devices needs (and should have!) memory
>block devices.
>
>Factor out creation of memory block devices Create all devices after
>arch_add_memory() succeeded. We can later drop the want_memblock parameter,
>because it is now effectively stale.
>
>Only after memory block devices have been added, memory can be onlined
>by user space. This implies, that memory is not visible to user space at
>all before arch_add_memory() succeeded.
>
>Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
>Cc: "Rafael J. Wysocki" <rafael@kernel.org>
>Cc: David Hildenbrand <david@redhat.com>
>Cc: "mike.travis@hpe.com" <mike.travis@hpe.com>
>Cc: Andrew Morton <akpm@linux-foundation.org>
>Cc: Ingo Molnar <mingo@kernel.org>
>Cc: Andrew Banman <andrew.banman@hpe.com>
>Cc: Oscar Salvador <osalvador@suse.de>
>Cc: Michal Hocko <mhocko@suse.com>
>Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
>Cc: Qian Cai <cai@lca.pw>
>Cc: Wei Yang <richard.weiyang@gmail.com>
>Cc: Arun KS <arunks@codeaurora.org>
>Cc: Mathieu Malaterre <malat@debian.org>
>Signed-off-by: David Hildenbrand <david@redhat.com>
>---
> drivers/base/memory.c  | 70 ++++++++++++++++++++++++++----------------
> include/linux/memory.h |  2 +-
> mm/memory_hotplug.c    | 15 ++++-----
> 3 files changed, 53 insertions(+), 34 deletions(-)
>
>diff --git a/drivers/base/memory.c b/drivers/base/memory.c
>index 6e0cb4fda179..862c202a18ca 100644
>--- a/drivers/base/memory.c
>+++ b/drivers/base/memory.c
>@@ -701,44 +701,62 @@ static int add_memory_block(int base_section_nr)
> 	return 0;
> }
> 
>+static void unregister_memory(struct memory_block *memory)
>+{
>+	BUG_ON(memory->dev.bus != &memory_subsys);
>+
>+	/* drop the ref. we got via find_memory_block() */
>+	put_device(&memory->dev);
>+	device_unregister(&memory->dev);
>+}
>+
> /*
>- * need an interface for the VM to add new memory regions,
>- * but without onlining it.
>+ * Create memory block devices for the given memory area. Start and size
>+ * have to be aligned to memory block granularity. Memory block devices
>+ * will be initialized as offline.
>  */
>-int hotplug_memory_register(int nid, struct mem_section *section)
>+int hotplug_memory_register(unsigned long start, unsigned long size)
> {
>-	int ret = 0;
>+	unsigned long block_nr_pages = memory_block_size_bytes() >> PAGE_SHIFT;
>+	unsigned long start_pfn = PFN_DOWN(start);
>+	unsigned long end_pfn = start_pfn + (size >> PAGE_SHIFT);
>+	unsigned long pfn;
> 	struct memory_block *mem;
>+	int ret = 0;
> 
>-	mutex_lock(&mem_sysfs_mutex);
>+	BUG_ON(!IS_ALIGNED(start, memory_block_size_bytes()));
>+	BUG_ON(!IS_ALIGNED(size, memory_block_size_bytes()));
> 
>-	mem = find_memory_block(section);
>-	if (mem) {
>-		mem->section_count++;
>-		put_device(&mem->dev);
>-	} else {
>-		ret = init_memory_block(&mem, section, MEM_OFFLINE);
>+	mutex_lock(&mem_sysfs_mutex);
>+	for (pfn = start_pfn; pfn != end_pfn; pfn += block_nr_pages) {
>+		mem = find_memory_block(__pfn_to_section(pfn));
>+		if (mem) {
>+			WARN_ON_ONCE(false);

One question here, the purpose of WARN_ON_ONCE(false) is? Would we trigger
this?

>+			put_device(&mem->dev);
>+			continue;
>+		}
>+		ret = init_memory_block(&mem, __pfn_to_section(pfn),
>+					MEM_OFFLINE);
> 		if (ret)
>-			goto out;
>-		mem->section_count++;
>+			break;
>+		mem->section_count = memory_block_size_bytes() /
>+				     MIN_MEMORY_BLOCK_SIZE;

Maybe we can leverage sections_per_block variable.

                mem->section_count = sections_per_block;

>+	}
>+	if (ret) {
>+		end_pfn = pfn;
>+		for (pfn = start_pfn; pfn != end_pfn; pfn += block_nr_pages) {
>+			mem = find_memory_block(__pfn_to_section(pfn));
>+			if (!mem)
>+				continue;
>+			mem->section_count = 0;
>+			unregister_memory(mem);
>+		}
> 	}
David Hildenbrand May 9, 2019, 2:05 p.m. UTC | #7
On 09.05.19 15:55, Wei Yang wrote:
> On Tue, May 07, 2019 at 08:38:00PM +0200, David Hildenbrand wrote:
>> Only memory to be added to the buddy and to be onlined/offlined by
>> user space using memory block devices needs (and should have!) memory
>> block devices.
>>
>> Factor out creation of memory block devices Create all devices after
>> arch_add_memory() succeeded. We can later drop the want_memblock parameter,
>> because it is now effectively stale.
>>
>> Only after memory block devices have been added, memory can be onlined
>> by user space. This implies, that memory is not visible to user space at
>> all before arch_add_memory() succeeded.
>>
>> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
>> Cc: "Rafael J. Wysocki" <rafael@kernel.org>
>> Cc: David Hildenbrand <david@redhat.com>
>> Cc: "mike.travis@hpe.com" <mike.travis@hpe.com>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Ingo Molnar <mingo@kernel.org>
>> Cc: Andrew Banman <andrew.banman@hpe.com>
>> Cc: Oscar Salvador <osalvador@suse.de>
>> Cc: Michal Hocko <mhocko@suse.com>
>> Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
>> Cc: Qian Cai <cai@lca.pw>
>> Cc: Wei Yang <richard.weiyang@gmail.com>
>> Cc: Arun KS <arunks@codeaurora.org>
>> Cc: Mathieu Malaterre <malat@debian.org>
>> Signed-off-by: David Hildenbrand <david@redhat.com>
>> ---
>> drivers/base/memory.c  | 70 ++++++++++++++++++++++++++----------------
>> include/linux/memory.h |  2 +-
>> mm/memory_hotplug.c    | 15 ++++-----
>> 3 files changed, 53 insertions(+), 34 deletions(-)
>>
>> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
>> index 6e0cb4fda179..862c202a18ca 100644
>> --- a/drivers/base/memory.c
>> +++ b/drivers/base/memory.c
>> @@ -701,44 +701,62 @@ static int add_memory_block(int base_section_nr)
>> 	return 0;
>> }
>>
>> +static void unregister_memory(struct memory_block *memory)
>> +{
>> +	BUG_ON(memory->dev.bus != &memory_subsys);
>> +
>> +	/* drop the ref. we got via find_memory_block() */
>> +	put_device(&memory->dev);
>> +	device_unregister(&memory->dev);
>> +}
>> +
>> /*
>> - * need an interface for the VM to add new memory regions,
>> - * but without onlining it.
>> + * Create memory block devices for the given memory area. Start and size
>> + * have to be aligned to memory block granularity. Memory block devices
>> + * will be initialized as offline.
>>  */
>> -int hotplug_memory_register(int nid, struct mem_section *section)
>> +int hotplug_memory_register(unsigned long start, unsigned long size)
>> {
>> -	int ret = 0;
>> +	unsigned long block_nr_pages = memory_block_size_bytes() >> PAGE_SHIFT;
>> +	unsigned long start_pfn = PFN_DOWN(start);
>> +	unsigned long end_pfn = start_pfn + (size >> PAGE_SHIFT);
>> +	unsigned long pfn;
>> 	struct memory_block *mem;
>> +	int ret = 0;
>>
>> -	mutex_lock(&mem_sysfs_mutex);
>> +	BUG_ON(!IS_ALIGNED(start, memory_block_size_bytes()));
>> +	BUG_ON(!IS_ALIGNED(size, memory_block_size_bytes()));
>>
>> -	mem = find_memory_block(section);
>> -	if (mem) {
>> -		mem->section_count++;
>> -		put_device(&mem->dev);
>> -	} else {
>> -		ret = init_memory_block(&mem, section, MEM_OFFLINE);
>> +	mutex_lock(&mem_sysfs_mutex);
>> +	for (pfn = start_pfn; pfn != end_pfn; pfn += block_nr_pages) {
>> +		mem = find_memory_block(__pfn_to_section(pfn));
>> +		if (mem) {
>> +			WARN_ON_ONCE(false);
> 
> One question here, the purpose of WARN_ON_ONCE(false) is? Would we trigger
> this?

Would happen if something goes terribly wrong. We might want to remove
this once we are sure this will not happen.

I replaced it in the meantime by a

if (WARN_ON_ONCE(mem)) {
	put_device(&mem->dev);
	ret = -EEXIST;
	break;
}

> 
>> +			put_device(&mem->dev);
>> +			continue;
>> +		}
>> +		ret = init_memory_block(&mem, __pfn_to_section(pfn),
>> +					MEM_OFFLINE);
>> 		if (ret)
>> -			goto out;
>> -		mem->section_count++;
>> +			break;
>> +		mem->section_count = memory_block_size_bytes() /
>> +				     MIN_MEMORY_BLOCK_SIZE;
> 
> Maybe we can leverage sections_per_block variable.

Most certainly if it does what I think it does :) thanks!
Wei Yang May 9, 2019, 2:31 p.m. UTC | #8
On Tue, May 07, 2019 at 08:38:00PM +0200, David Hildenbrand wrote:
>Only memory to be added to the buddy and to be onlined/offlined by
>user space using memory block devices needs (and should have!) memory
>block devices.
>
>Factor out creation of memory block devices Create all devices after
>arch_add_memory() succeeded. We can later drop the want_memblock parameter,
>because it is now effectively stale.
>
>Only after memory block devices have been added, memory can be onlined
>by user space. This implies, that memory is not visible to user space at
>all before arch_add_memory() succeeded.
>
>Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
>Cc: "Rafael J. Wysocki" <rafael@kernel.org>
>Cc: David Hildenbrand <david@redhat.com>
>Cc: "mike.travis@hpe.com" <mike.travis@hpe.com>
>Cc: Andrew Morton <akpm@linux-foundation.org>
>Cc: Ingo Molnar <mingo@kernel.org>
>Cc: Andrew Banman <andrew.banman@hpe.com>
>Cc: Oscar Salvador <osalvador@suse.de>
>Cc: Michal Hocko <mhocko@suse.com>
>Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
>Cc: Qian Cai <cai@lca.pw>
>Cc: Wei Yang <richard.weiyang@gmail.com>
>Cc: Arun KS <arunks@codeaurora.org>
>Cc: Mathieu Malaterre <malat@debian.org>
>Signed-off-by: David Hildenbrand <david@redhat.com>
>---
> drivers/base/memory.c  | 70 ++++++++++++++++++++++++++----------------
> include/linux/memory.h |  2 +-
> mm/memory_hotplug.c    | 15 ++++-----
> 3 files changed, 53 insertions(+), 34 deletions(-)
>
>diff --git a/drivers/base/memory.c b/drivers/base/memory.c
>index 6e0cb4fda179..862c202a18ca 100644
>--- a/drivers/base/memory.c
>+++ b/drivers/base/memory.c
>@@ -701,44 +701,62 @@ static int add_memory_block(int base_section_nr)
> 	return 0;
> }
> 
>+static void unregister_memory(struct memory_block *memory)
>+{
>+	BUG_ON(memory->dev.bus != &memory_subsys);
>+
>+	/* drop the ref. we got via find_memory_block() */
>+	put_device(&memory->dev);
>+	device_unregister(&memory->dev);
>+}
>+
> /*
>- * need an interface for the VM to add new memory regions,
>- * but without onlining it.
>+ * Create memory block devices for the given memory area. Start and size
>+ * have to be aligned to memory block granularity. Memory block devices
>+ * will be initialized as offline.
>  */
>-int hotplug_memory_register(int nid, struct mem_section *section)
>+int hotplug_memory_register(unsigned long start, unsigned long size)

One trivial suggestion about the function name.

For memory_block device, sometimes we use the full name

    find_memory_block
    init_memory_block
    add_memory_block

But sometimes we use *nick* name

    hotplug_memory_register
    register_memory
    unregister_memory

This is a little bit confusion.

Can we use one name convention here? 

[...]

> /*
>@@ -1106,6 +1100,13 @@ int __ref add_memory_resource(int nid, struct resource *res)
> 	if (ret < 0)
> 		goto error;
> 
>+	/* create memory block devices after memory was added */
>+	ret = hotplug_memory_register(start, size);
>+	if (ret) {
>+		arch_remove_memory(nid, start, size, NULL);

Functionally, it works I think.

But arch_remove_memory() would remove pages from zone. At this point, we just
allocate section/mmap for pages, the zones are empty and pages are not
connected to zone.

Function  zone = page_zone(page); always gets zone #0, since pages->flags is 0
at  this point. This is not exact.

Would we add some comment to mention this? Or we need to clean up
arch_remove_memory() to take out __remove_zone()?


>+		goto error;
>+	}
>+
> 	if (new_node) {
> 		/* If sysfs file of new node can't be created, cpu on the node
> 		 * can't be hot-added. There is no rollback way now.
>-- 
>2.20.1
David Hildenbrand May 9, 2019, 2:58 p.m. UTC | #9
On 09.05.19 16:31, Wei Yang wrote:
> On Tue, May 07, 2019 at 08:38:00PM +0200, David Hildenbrand wrote:
>> Only memory to be added to the buddy and to be onlined/offlined by
>> user space using memory block devices needs (and should have!) memory
>> block devices.
>>
>> Factor out creation of memory block devices Create all devices after
>> arch_add_memory() succeeded. We can later drop the want_memblock parameter,
>> because it is now effectively stale.
>>
>> Only after memory block devices have been added, memory can be onlined
>> by user space. This implies, that memory is not visible to user space at
>> all before arch_add_memory() succeeded.
>>
>> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
>> Cc: "Rafael J. Wysocki" <rafael@kernel.org>
>> Cc: David Hildenbrand <david@redhat.com>
>> Cc: "mike.travis@hpe.com" <mike.travis@hpe.com>
>> Cc: Andrew Morton <akpm@linux-foundation.org>
>> Cc: Ingo Molnar <mingo@kernel.org>
>> Cc: Andrew Banman <andrew.banman@hpe.com>
>> Cc: Oscar Salvador <osalvador@suse.de>
>> Cc: Michal Hocko <mhocko@suse.com>
>> Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
>> Cc: Qian Cai <cai@lca.pw>
>> Cc: Wei Yang <richard.weiyang@gmail.com>
>> Cc: Arun KS <arunks@codeaurora.org>
>> Cc: Mathieu Malaterre <malat@debian.org>
>> Signed-off-by: David Hildenbrand <david@redhat.com>
>> ---
>> drivers/base/memory.c  | 70 ++++++++++++++++++++++++++----------------
>> include/linux/memory.h |  2 +-
>> mm/memory_hotplug.c    | 15 ++++-----
>> 3 files changed, 53 insertions(+), 34 deletions(-)
>>
>> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
>> index 6e0cb4fda179..862c202a18ca 100644
>> --- a/drivers/base/memory.c
>> +++ b/drivers/base/memory.c
>> @@ -701,44 +701,62 @@ static int add_memory_block(int base_section_nr)
>> 	return 0;
>> }
>>
>> +static void unregister_memory(struct memory_block *memory)
>> +{
>> +	BUG_ON(memory->dev.bus != &memory_subsys);
>> +
>> +	/* drop the ref. we got via find_memory_block() */
>> +	put_device(&memory->dev);
>> +	device_unregister(&memory->dev);
>> +}
>> +
>> /*
>> - * need an interface for the VM to add new memory regions,
>> - * but without onlining it.
>> + * Create memory block devices for the given memory area. Start and size
>> + * have to be aligned to memory block granularity. Memory block devices
>> + * will be initialized as offline.
>>  */
>> -int hotplug_memory_register(int nid, struct mem_section *section)
>> +int hotplug_memory_register(unsigned long start, unsigned long size)
> 
> One trivial suggestion about the function name.
> 
> For memory_block device, sometimes we use the full name
> 
>     find_memory_block
>     init_memory_block
>     add_memory_block
> 
> But sometimes we use *nick* name
> 
>     hotplug_memory_register
>     register_memory
>     unregister_memory
> 
> This is a little bit confusion.
> 
> Can we use one name convention here?

We can just go for

crate_memory_blocks() and free_memory_blocks(). Or do
you have better suggestions?

(I would actually even prefer "memory_block_devices", because memory
blocks have different meanins)

> 
> [...]
> 
>> /*
>> @@ -1106,6 +1100,13 @@ int __ref add_memory_resource(int nid, struct resource *res)
>> 	if (ret < 0)
>> 		goto error;
>>
>> +	/* create memory block devices after memory was added */
>> +	ret = hotplug_memory_register(start, size);
>> +	if (ret) {
>> +		arch_remove_memory(nid, start, size, NULL);
> 
> Functionally, it works I think.
> 
> But arch_remove_memory() would remove pages from zone. At this point, we just
> allocate section/mmap for pages, the zones are empty and pages are not
> connected to zone.
> 
> Function  zone = page_zone(page); always gets zone #0, since pages->flags is 0
> at  this point. This is not exact.
> 
> Would we add some comment to mention this? Or we need to clean up
> arch_remove_memory() to take out __remove_zone()?

That is precisely what is on my list next (see cover letter).This is
already broken when memory that was never onlined is removed again.
So I am planning to fix that independently.
Wei Yang May 9, 2019, 9:50 p.m. UTC | #10
On Thu, May 09, 2019 at 04:58:56PM +0200, David Hildenbrand wrote:
>On 09.05.19 16:31, Wei Yang wrote:
>> On Tue, May 07, 2019 at 08:38:00PM +0200, David Hildenbrand wrote:
>>> Only memory to be added to the buddy and to be onlined/offlined by
>>> user space using memory block devices needs (and should have!) memory
>>> block devices.
>>>
>>> Factor out creation of memory block devices Create all devices after
>>> arch_add_memory() succeeded. We can later drop the want_memblock parameter,
>>> because it is now effectively stale.
>>>
>>> Only after memory block devices have been added, memory can be onlined
>>> by user space. This implies, that memory is not visible to user space at
>>> all before arch_add_memory() succeeded.
>>>
>>> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
>>> Cc: "Rafael J. Wysocki" <rafael@kernel.org>
>>> Cc: David Hildenbrand <david@redhat.com>
>>> Cc: "mike.travis@hpe.com" <mike.travis@hpe.com>
>>> Cc: Andrew Morton <akpm@linux-foundation.org>
>>> Cc: Ingo Molnar <mingo@kernel.org>
>>> Cc: Andrew Banman <andrew.banman@hpe.com>
>>> Cc: Oscar Salvador <osalvador@suse.de>
>>> Cc: Michal Hocko <mhocko@suse.com>
>>> Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
>>> Cc: Qian Cai <cai@lca.pw>
>>> Cc: Wei Yang <richard.weiyang@gmail.com>
>>> Cc: Arun KS <arunks@codeaurora.org>
>>> Cc: Mathieu Malaterre <malat@debian.org>
>>> Signed-off-by: David Hildenbrand <david@redhat.com>
>>> ---
>>> drivers/base/memory.c  | 70 ++++++++++++++++++++++++++----------------
>>> include/linux/memory.h |  2 +-
>>> mm/memory_hotplug.c    | 15 ++++-----
>>> 3 files changed, 53 insertions(+), 34 deletions(-)
>>>
>>> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
>>> index 6e0cb4fda179..862c202a18ca 100644
>>> --- a/drivers/base/memory.c
>>> +++ b/drivers/base/memory.c
>>> @@ -701,44 +701,62 @@ static int add_memory_block(int base_section_nr)
>>> 	return 0;
>>> }
>>>
>>> +static void unregister_memory(struct memory_block *memory)
>>> +{
>>> +	BUG_ON(memory->dev.bus != &memory_subsys);
>>> +
>>> +	/* drop the ref. we got via find_memory_block() */
>>> +	put_device(&memory->dev);
>>> +	device_unregister(&memory->dev);
>>> +}
>>> +
>>> /*
>>> - * need an interface for the VM to add new memory regions,
>>> - * but without onlining it.
>>> + * Create memory block devices for the given memory area. Start and size
>>> + * have to be aligned to memory block granularity. Memory block devices
>>> + * will be initialized as offline.
>>>  */
>>> -int hotplug_memory_register(int nid, struct mem_section *section)
>>> +int hotplug_memory_register(unsigned long start, unsigned long size)
>> 
>> One trivial suggestion about the function name.
>> 
>> For memory_block device, sometimes we use the full name
>> 
>>     find_memory_block
>>     init_memory_block
>>     add_memory_block
>> 
>> But sometimes we use *nick* name
>> 
>>     hotplug_memory_register
>>     register_memory
>>     unregister_memory
>> 
>> This is a little bit confusion.
>> 
>> Can we use one name convention here?
>
>We can just go for
>
>crate_memory_blocks() and free_memory_blocks(). Or do
>you have better suggestions?

s/crate/create/

Looks good to me.

>
>(I would actually even prefer "memory_block_devices", because memory
>blocks have different meanins)
>

Agree with you, this comes to my mind sometime ago :-)

>> 
>> [...]
>> 
>>> /*
>>> @@ -1106,6 +1100,13 @@ int __ref add_memory_resource(int nid, struct resource *res)
>>> 	if (ret < 0)
>>> 		goto error;
>>>
>>> +	/* create memory block devices after memory was added */
>>> +	ret = hotplug_memory_register(start, size);
>>> +	if (ret) {
>>> +		arch_remove_memory(nid, start, size, NULL);
>> 
>> Functionally, it works I think.
>> 
>> But arch_remove_memory() would remove pages from zone. At this point, we just
>> allocate section/mmap for pages, the zones are empty and pages are not
>> connected to zone.
>> 
>> Function  zone = page_zone(page); always gets zone #0, since pages->flags is 0
>> at  this point. This is not exact.
>> 
>> Would we add some comment to mention this? Or we need to clean up
>> arch_remove_memory() to take out __remove_zone()?
>
>That is precisely what is on my list next (see cover letter).This is
>already broken when memory that was never onlined is removed again.
>So I am planning to fix that independently.
>

Sounds great :-)

Hope you would cc me in the following series.

>
>-- 
>
>Thanks,
>
>David / dhildenb
David Hildenbrand May 9, 2019, 10:18 p.m. UTC | #11
> Looks good to me.
> 
>>
>> (I would actually even prefer "memory_block_devices", because memory
>> blocks have different meanins)
>>
> 
> Agree with you, this comes to my mind sometime ago :-)

We have memblocks, memory_blocks  ... I guess memory_block_device is
unique :)

> 
>>>
>>> [...]
>>>
>>>> /*
>>>> @@ -1106,6 +1100,13 @@ int __ref add_memory_resource(int nid, struct resource *res)
>>>> 	if (ret < 0)
>>>> 		goto error;
>>>>
>>>> +	/* create memory block devices after memory was added */
>>>> +	ret = hotplug_memory_register(start, size);
>>>> +	if (ret) {
>>>> +		arch_remove_memory(nid, start, size, NULL);
>>>
>>> Functionally, it works I think.
>>>
>>> But arch_remove_memory() would remove pages from zone. At this point, we just
>>> allocate section/mmap for pages, the zones are empty and pages are not
>>> connected to zone.
>>>
>>> Function  zone = page_zone(page); always gets zone #0, since pages->flags is 0
>>> at  this point. This is not exact.
>>>
>>> Would we add some comment to mention this? Or we need to clean up
>>> arch_remove_memory() to take out __remove_zone()?
>>
>> That is precisely what is on my list next (see cover letter).This is
>> already broken when memory that was never onlined is removed again.
>> So I am planning to fix that independently.
>>
> 
> Sounds great :-)

Especially, I suspect a lot of bugs in the area of

1. Remove memory that has never been onlined
2. Remove memory that has been onlined/offlined a couple of times
3. Remove memory that has been onlined to different zones.

Will see when refactoring if my intuition is right :)

> 
> Hope you would cc me in the following series.


Sure! Thanks!
diff mbox series

Patch

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 6e0cb4fda179..862c202a18ca 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -701,44 +701,62 @@  static int add_memory_block(int base_section_nr)
 	return 0;
 }
 
+static void unregister_memory(struct memory_block *memory)
+{
+	BUG_ON(memory->dev.bus != &memory_subsys);
+
+	/* drop the ref. we got via find_memory_block() */
+	put_device(&memory->dev);
+	device_unregister(&memory->dev);
+}
+
 /*
- * need an interface for the VM to add new memory regions,
- * but without onlining it.
+ * Create memory block devices for the given memory area. Start and size
+ * have to be aligned to memory block granularity. Memory block devices
+ * will be initialized as offline.
  */
-int hotplug_memory_register(int nid, struct mem_section *section)
+int hotplug_memory_register(unsigned long start, unsigned long size)
 {
-	int ret = 0;
+	unsigned long block_nr_pages = memory_block_size_bytes() >> PAGE_SHIFT;
+	unsigned long start_pfn = PFN_DOWN(start);
+	unsigned long end_pfn = start_pfn + (size >> PAGE_SHIFT);
+	unsigned long pfn;
 	struct memory_block *mem;
+	int ret = 0;
 
-	mutex_lock(&mem_sysfs_mutex);
+	BUG_ON(!IS_ALIGNED(start, memory_block_size_bytes()));
+	BUG_ON(!IS_ALIGNED(size, memory_block_size_bytes()));
 
-	mem = find_memory_block(section);
-	if (mem) {
-		mem->section_count++;
-		put_device(&mem->dev);
-	} else {
-		ret = init_memory_block(&mem, section, MEM_OFFLINE);
+	mutex_lock(&mem_sysfs_mutex);
+	for (pfn = start_pfn; pfn != end_pfn; pfn += block_nr_pages) {
+		mem = find_memory_block(__pfn_to_section(pfn));
+		if (mem) {
+			WARN_ON_ONCE(false);
+			put_device(&mem->dev);
+			continue;
+		}
+		ret = init_memory_block(&mem, __pfn_to_section(pfn),
+					MEM_OFFLINE);
 		if (ret)
-			goto out;
-		mem->section_count++;
+			break;
+		mem->section_count = memory_block_size_bytes() /
+				     MIN_MEMORY_BLOCK_SIZE;
+	}
+	if (ret) {
+		end_pfn = pfn;
+		for (pfn = start_pfn; pfn != end_pfn; pfn += block_nr_pages) {
+			mem = find_memory_block(__pfn_to_section(pfn));
+			if (!mem)
+				continue;
+			mem->section_count = 0;
+			unregister_memory(mem);
+		}
 	}
-
-out:
 	mutex_unlock(&mem_sysfs_mutex);
 	return ret;
 }
 
-static void
-unregister_memory(struct memory_block *memory)
-{
-	BUG_ON(memory->dev.bus != &memory_subsys);
-
-	/* drop the ref. we got via find_memory_block() */
-	put_device(&memory->dev);
-	device_unregister(&memory->dev);
-}
-
-void unregister_memory_section(struct mem_section *section)
+static int remove_memory_section(struct mem_section *section)
 {
 	struct memory_block *mem;
 
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 474c7c60c8f2..95505fbb5f85 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -111,7 +111,7 @@  extern int register_memory_notifier(struct notifier_block *nb);
 extern void unregister_memory_notifier(struct notifier_block *nb);
 extern int register_memory_isolate_notifier(struct notifier_block *nb);
 extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
-int hotplug_memory_register(int nid, struct mem_section *section);
+int hotplug_memory_register(unsigned long start, unsigned long size);
 extern void unregister_memory_section(struct mem_section *);
 extern int memory_dev_init(void);
 extern int memory_notify(unsigned long val, void *v);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 7b5439839d67..e1637c8a0723 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -258,13 +258,7 @@  static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
 		return -EEXIST;
 
 	ret = sparse_add_one_section(nid, phys_start_pfn, altmap);
-	if (ret < 0)
-		return ret;
-
-	if (!want_memblock)
-		return 0;
-
-	return hotplug_memory_register(nid, __pfn_to_section(phys_start_pfn));
+	return ret < 0 ? ret : 0;
 }
 
 /*
@@ -1106,6 +1100,13 @@  int __ref add_memory_resource(int nid, struct resource *res)
 	if (ret < 0)
 		goto error;
 
+	/* create memory block devices after memory was added */
+	ret = hotplug_memory_register(start, size);
+	if (ret) {
+		arch_remove_memory(nid, start, size, NULL);
+		goto error;
+	}
+
 	if (new_node) {
 		/* If sysfs file of new node can't be created, cpu on the node
 		 * can't be hot-added. There is no rollback way now.