diff mbox series

[RFC,2/3] mm/memory_hotplug: Create memory block devices after arch_add_memory()

Message ID 20190408101226.20976-3-david@redhat.com (mailing list archive)
State New, archived
Headers show
Series mm/memory_hotplug: Factor out memory block device handling | expand

Commit Message

David Hildenbrand April 8, 2019, 10:12 a.m. UTC
Only memory added via add_memory() and friends needs memory block
devices - that is, only memory that is to be used via the buddy and to be
onlined/offlined by user space in memory block granularity.

Move creation of memory block devices out of arch_add_memory(). Create all
devices after arch_add_memory() succeeded. We can later drop the
want_memblock parameter, because it is now effectively stale.

Memory can be onlined by user space only after memory block devices have
been added. This implies that memory is not visible to user space at all
before arch_add_memory() has succeeded.

Issue 1: __add_pages() does not remove pages in case something goes
wrong. If that happens, we would now no longer create memory block
devices for such "partially added memory", so the memory would not be
usable/onlinable. Is that bad? Or is it related to issue 2 (e.g. should we
fix __add_pages() to remove any parts that were added in case of an
error)? Functions that fail and don't clean up are not that nice.

Issue 2: In case we can't add memory block devices, and we don't have
CONFIG_MEMORY_HOTREMOVE, we can't remove the pages via
arch_remove_memory(). Maybe we should try to get rid of
CONFIG_MEMORY_HOTREMOVE, so we can handle all failures in a nice way? Or
at least always allow arch_remove_memory() and friends, i.e. a subset of
CONFIG_MEMORY_HOTREMOVE.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 drivers/base/memory.c  | 67 +++++++++++++++++++++++++-----------------
 include/linux/memory.h |  2 +-
 mm/memory_hotplug.c    | 17 +++++++----
 3 files changed, 53 insertions(+), 33 deletions(-)

Comments

David Hildenbrand April 9, 2019, 7:33 a.m. UTC | #1
On 08.04.19 12:12, David Hildenbrand wrote:
> Only memory added via add_memory() and friends will need memory
> block devices - only memory to be used via the buddy and to be onlined/
> offlined by user space in memory block granularity.
> 
> Move creation of memory block devices out of arch_add_memory(). Create all
> devices after arch_add_memory() succeeded. We can later drop the
> want_memblock parameter, because it is now effectively stale.
> 
> Only after memory block devices have been added, memory can be onlined
> by user space. This implies, that memory is not visible to user space at
> all before arch_add_memory() succeeded.
> 
> Issue 1: __add_pages() does not remove pages in case something went
> wrong. If this is the case, we would now no longer create memory block
> devices for such "partially added memory". So the memory would not be
> usable/onlinable. Bad? Or related to issue 2 (e.g. fix __add_pages()
> to remove any parts that were added in case of an error). Functions that
> fail and don't clean up are not that nice.
> 
> Issue 2: In case we can't add memory block devices, and we don't have
> HOTREMOVE, we can't remove the pages via arch_remove_pages. Maybe we should
> try to get rid of CONFIG_MEMORY_HOTREMOVE, so we can handle all failures
> in a nice way? Or at least allow arch_remove_pages() and friends, so a
> subset of CONFIG_MEMORY_HOTREMOVE.
> 
> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  drivers/base/memory.c  | 67 +++++++++++++++++++++++++-----------------
>  include/linux/memory.h |  2 +-
>  mm/memory_hotplug.c    | 17 +++++++----
>  3 files changed, 53 insertions(+), 33 deletions(-)
> 
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index d9ebb89816f7..847b33061e2e 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -701,44 +701,57 @@ static int add_memory_block(int base_section_nr)
>  	return 0;
>  }
>  
> -/*
> - * need an interface for the VM to add new memory regions,
> - * but without onlining it.
> - */
> -int hotplug_memory_register(int nid, struct mem_section *section)
> +static void unregister_memory(struct memory_block *memory)
>  {
> -	int ret = 0;
> +	BUG_ON(memory->dev.bus != &memory_subsys);
> +
> +	/* drop the ref. we got via find_memory_block() */
> +	put_device(&memory->dev);
> +	device_unregister(&memory->dev);
> +}
> +
> +int hotplug_memory_register(unsigned long start, unsigned long size)
> +{
> +	unsigned long block_nr_pages = memory_block_size_bytes() >> PAGE_SHIFT;
> +	unsigned long start_pfn = PFN_DOWN(start);
> +	unsigned long end_pfn = start_pfn + (size >> PAGE_SHIFT);
> +	unsigned long pfn;
>  	struct memory_block *mem;
> +	int ret = 0;
>  
> -	mutex_lock(&mem_sysfs_mutex);
> +	BUG_ON(!IS_ALIGNED(start, memory_block_size_bytes()));
> +	BUG_ON(!IS_ALIGNED(size, memory_block_size_bytes()));
>  
> -	mem = find_memory_block(section);
> -	if (mem) {
> -		mem->section_count++;
> -		put_device(&mem->dev);
> -	} else {
> -		ret = init_memory_block(&mem, section, MEM_OFFLINE);
> +	mutex_lock(&mem_sysfs_mutex);
> +	for (pfn = start_pfn; pfn != end_pfn; pfn += block_nr_pages) {
> +		mem = find_memory_block(__pfn_to_section(pfn));
> +		if (mem) {
> +			WARN_ON_ONCE(false);
> +			put_device(&mem->dev);
> +			continue;
> +		}
> +		ret = init_memory_block(&mem, __pfn_to_section(pfn),
> +					MEM_OFFLINE);
>  		if (ret)
> -			goto out;
> -		mem->section_count++;
> +			break;
> +		mem->section_count = memory_block_size_bytes() /
> +				     MIN_MEMORY_BLOCK_SIZE;
> +	}
> +	if (ret) {
> +		end_pfn = pfn;
> +		for (pfn = start_pfn; pfn != end_pfn; pfn += block_nr_pages) {
> +			mem = find_memory_block(__pfn_to_section(pfn));
> +			if (!mem)
> +				continue;
> +			mem->section_count = 0;
> +			unregister_memory(mem);
> +		}
>  	}
> -
> -out:
>  	mutex_unlock(&mem_sysfs_mutex);
>  	return ret;
>  }
>  
>  #ifdef CONFIG_MEMORY_HOTREMOVE
> -static void
> -unregister_memory(struct memory_block *memory)
> -{
> -	BUG_ON(memory->dev.bus != &memory_subsys);
> -
> -	/* drop the ref. we got in remove_memory_section() */
> -	put_device(&memory->dev);
> -	device_unregister(&memory->dev);
> -}
> -
>  static int remove_memory_section(struct mem_section *section)
>  {
>  	struct memory_block *mem;
> diff --git a/include/linux/memory.h b/include/linux/memory.h
> index a6ddefc60517..e275dc775834 100644
> --- a/include/linux/memory.h
> +++ b/include/linux/memory.h
> @@ -111,7 +111,7 @@ extern int register_memory_notifier(struct notifier_block *nb);
>  extern void unregister_memory_notifier(struct notifier_block *nb);
>  extern int register_memory_isolate_notifier(struct notifier_block *nb);
>  extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
> -int hotplug_memory_register(int nid, struct mem_section *section);
> +int hotplug_memory_register(unsigned long start, unsigned long size);
>  #ifdef CONFIG_MEMORY_HOTREMOVE
>  extern int unregister_memory_section(struct mem_section *);
>  #endif
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 680dcc67f9d5..13ee0a26e034 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -260,11 +260,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
>  	ret = sparse_add_one_section(nid, phys_start_pfn, altmap);
>  	if (ret < 0)
>  		return ret;
> -
> -	if (!want_memblock)
> -		return 0;
> -
> -	return hotplug_memory_register(nid, __pfn_to_section(phys_start_pfn));
> +	return 0;
>  }
>  
>  /*
> @@ -1125,6 +1121,17 @@ int __ref add_memory_resource(int nid, struct resource *res)
>  	if (ret < 0)
>  		goto error;
>  
> +	/* create memory block devices after memory was added */
> +	ret = hotplug_memory_register(start, size);
> +#ifdef CONFIG_MEMORY_HOTREMOVE
> +	if (ret) {
> +		arch_remove_memory(nid, start, size, NULL);
> +		goto error;
> +	}
> +#else
> +	WARN_ON(ret);
> +#endif
> +
>  	if (new_node) {
>  		/* If sysfs file of new node can't be created, cpu on the node
>  		 * can't be hot-added. There is no rollback way now.
> 

FWIW, I think we should first try to make sure arch_remove_memory()
cannot fail / will not ignore errors if possible. There are still some
things in there that need more re-factoring first.
diff mbox series

Patch

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index d9ebb89816f7..847b33061e2e 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -701,44 +701,57 @@  static int add_memory_block(int base_section_nr)
 	return 0;
 }
 
-/*
- * need an interface for the VM to add new memory regions,
- * but without onlining it.
- */
-int hotplug_memory_register(int nid, struct mem_section *section)
+static void unregister_memory(struct memory_block *memory)
 {
-	int ret = 0;
+	BUG_ON(memory->dev.bus != &memory_subsys);
+
+	/* drop the ref. we got via find_memory_block() */
+	put_device(&memory->dev);
+	device_unregister(&memory->dev);
+}
+
+int hotplug_memory_register(unsigned long start, unsigned long size)
+{
+	unsigned long block_nr_pages = memory_block_size_bytes() >> PAGE_SHIFT;
+	unsigned long start_pfn = PFN_DOWN(start);
+	unsigned long end_pfn = start_pfn + (size >> PAGE_SHIFT);
+	unsigned long pfn;
 	struct memory_block *mem;
+	int ret = 0;
 
-	mutex_lock(&mem_sysfs_mutex);
+	BUG_ON(!IS_ALIGNED(start, memory_block_size_bytes()));
+	BUG_ON(!IS_ALIGNED(size, memory_block_size_bytes()));
 
-	mem = find_memory_block(section);
-	if (mem) {
-		mem->section_count++;
-		put_device(&mem->dev);
-	} else {
-		ret = init_memory_block(&mem, section, MEM_OFFLINE);
+	mutex_lock(&mem_sysfs_mutex);
+	for (pfn = start_pfn; pfn != end_pfn; pfn += block_nr_pages) {
+		mem = find_memory_block(__pfn_to_section(pfn));
+		if (mem) {
+			WARN_ON_ONCE(false);
+			put_device(&mem->dev);
+			continue;
+		}
+		ret = init_memory_block(&mem, __pfn_to_section(pfn),
+					MEM_OFFLINE);
 		if (ret)
-			goto out;
-		mem->section_count++;
+			break;
+		mem->section_count = memory_block_size_bytes() /
+				     MIN_MEMORY_BLOCK_SIZE;
+	}
+	if (ret) {
+		end_pfn = pfn;
+		for (pfn = start_pfn; pfn != end_pfn; pfn += block_nr_pages) {
+			mem = find_memory_block(__pfn_to_section(pfn));
+			if (!mem)
+				continue;
+			mem->section_count = 0;
+			unregister_memory(mem);
+		}
 	}
-
-out:
 	mutex_unlock(&mem_sysfs_mutex);
 	return ret;
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-static void
-unregister_memory(struct memory_block *memory)
-{
-	BUG_ON(memory->dev.bus != &memory_subsys);
-
-	/* drop the ref. we got in remove_memory_section() */
-	put_device(&memory->dev);
-	device_unregister(&memory->dev);
-}
-
 static int remove_memory_section(struct mem_section *section)
 {
 	struct memory_block *mem;
diff --git a/include/linux/memory.h b/include/linux/memory.h
index a6ddefc60517..e275dc775834 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -111,7 +111,7 @@  extern int register_memory_notifier(struct notifier_block *nb);
 extern void unregister_memory_notifier(struct notifier_block *nb);
 extern int register_memory_isolate_notifier(struct notifier_block *nb);
 extern void unregister_memory_isolate_notifier(struct notifier_block *nb);
-int hotplug_memory_register(int nid, struct mem_section *section);
+int hotplug_memory_register(unsigned long start, unsigned long size);
 #ifdef CONFIG_MEMORY_HOTREMOVE
 extern int unregister_memory_section(struct mem_section *);
 #endif
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 680dcc67f9d5..13ee0a26e034 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -260,11 +260,7 @@  static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
 	ret = sparse_add_one_section(nid, phys_start_pfn, altmap);
 	if (ret < 0)
 		return ret;
-
-	if (!want_memblock)
-		return 0;
-
-	return hotplug_memory_register(nid, __pfn_to_section(phys_start_pfn));
+	return 0;
 }
 
 /*
@@ -1125,6 +1121,17 @@  int __ref add_memory_resource(int nid, struct resource *res)
 	if (ret < 0)
 		goto error;
 
+	/* create memory block devices after memory was added */
+	ret = hotplug_memory_register(start, size);
+#ifdef CONFIG_MEMORY_HOTREMOVE
+	if (ret) {
+		arch_remove_memory(nid, start, size, NULL);
+		goto error;
+	}
+#else
+	WARN_ON(ret);
+#endif
+
 	if (new_node) {
 		/* If sysfs file of new node can't be created, cpu on the node
 		 * can't be hot-added. There is no rollback way now.