[v5,6/7] mm/hotplug: Embed vmem_altmap details in memory block

Message ID 20230725100212.531277-7-aneesh.kumar@linux.ibm.com (mailing list archive)
State New
Series Add support for memmap on memory feature on ppc64

Commit Message

Aneesh Kumar K.V July 25, 2023, 10:02 a.m. UTC
With memmap on memory, some architectures need more details about the
altmap, such as base_pfn, end_pfn, etc., to unmap vmemmap memory.
Instead of computing them again when we remove a memory block, embed the
vmem_altmap details in struct memory_block if we are using the memmap on
memory feature.

No functional change in this patch.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
 drivers/base/memory.c  | 32 +++++++++++++++++++++++---------
 include/linux/memory.h |  8 ++------
 mm/memory_hotplug.c    | 41 ++++++++++++++++++++++-------------------
 3 files changed, 47 insertions(+), 34 deletions(-)
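
At its core, the patch swaps the cached vmemmap page count in struct
memory_block for a pointer to the full altmap description. A condensed
view of the result (abridged from the include/linux/memory.h hunk in the
patch below; unrelated members are elided):

	struct memory_block {
		unsigned long start_section_nr;
		unsigned long state;
		...
		struct zone *zone;
		struct device dev;
		/*
		 * vmem_altmap details (base_pfn, free, alloc, ...) for
		 * blocks added with memmap-on-memory; NULL otherwise.
		 */
		struct vmem_altmap *altmap;
		struct memory_group *group;	/* group (if any) for this block */
		struct list_head group_next;	/* next block inside memory group */
	};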

Comments

David Hildenbrand July 26, 2023, 9:41 a.m. UTC | #1
On 25.07.23 12:02, Aneesh Kumar K.V wrote:
> With memmap on memory, some architectures need more details about the
> altmap, such as base_pfn, end_pfn, etc., to unmap vmemmap memory.
> Instead of computing them again when we remove a memory block, embed the
> vmem_altmap details in struct memory_block if we are using the memmap on
> memory feature.
> 
> No functional change in this patch.
> 
> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
> ---

[...]

>   
>   static int add_memory_block(unsigned long block_id, unsigned long state,
> -			    unsigned long nr_vmemmap_pages,
> +			    struct vmem_altmap *altmap,
>   			    struct memory_group *group)
>   {
>   	struct memory_block *mem;
> @@ -744,7 +751,14 @@ static int add_memory_block(unsigned long block_id, unsigned long state,
>   	mem->start_section_nr = block_id * sections_per_block;
>   	mem->state = state;
>   	mem->nid = NUMA_NO_NODE;
> -	mem->nr_vmemmap_pages = nr_vmemmap_pages;
> +	if (altmap) {
> +		mem->altmap = kmalloc(sizeof(struct vmem_altmap), GFP_KERNEL);
> +		if (!mem->altmap) {
> +			kfree(mem);
> +			return -ENOMEM;
> +		}
> +		memcpy(mem->altmap, altmap, sizeof(*altmap));
> +	}

I'm wondering if we should instead let the caller do the alloc/free. So we would alloc
in the caller and would only store the pointer.

Before removing the memory block, we would clear the pointer and free it in the caller.

IOW, when removing a memory block and we still have an altmap set, something would be wrong.

See below on try_remove_memory() handling.
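
(A rough sketch of that ownership model -- not the patch itself; the
hot-add side is only implied here, with kmemdup() standing in for
whatever allocation add_memory_resource() ends up doing:

	/* hot-add: allocate once, hand the pointer to the block devices */
	params.altmap = kmemdup(&mhp_altmap, sizeof(mhp_altmap), GFP_KERNEL);
	if (!params.altmap)
		goto error;
	...
	ret = create_memory_block_devices(start, size, params.altmap, group);

	/* hot-remove: detach from the block, use it, then free it */
	altmap = mem->altmap;
	mem->altmap = NULL;
	arch_remove_memory(start, size, altmap);
	kfree(altmap);

With that, a memory block that still has ->altmap set when it is
released indicates a bug.)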

[...]

> -static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
> +static int get_vmemmap_altmap_cb(struct memory_block *mem, void *arg)
>   {
> +	struct vmem_altmap *altmap = (struct vmem_altmap *)arg;
>   	/*
> -	 * If not set, continue with the next block.
> +	 * If we have any pages allocated from the altmap,
> +	 * return the altmap details and stop the walk.
>   	 */
> -	return mem->nr_vmemmap_pages;
> +	if (mem->altmap) {
> +		memcpy(altmap, mem->altmap, sizeof(struct vmem_altmap));
> +		return 1;
> +	}
> +	return 0;
>   }
>   
>   static int check_cpu_on_node(int nid)
> @@ -2146,9 +2152,8 @@ EXPORT_SYMBOL(try_offline_node);
>   
>   static int __ref try_remove_memory(u64 start, u64 size)
>   {
> -	struct vmem_altmap mhp_altmap = {};
> -	struct vmem_altmap *altmap = NULL;
> -	unsigned long nr_vmemmap_pages;
> +	int ret;
> +	struct vmem_altmap mhp_altmap, *altmap = NULL;
>   	int rc = 0, nid = NUMA_NO_NODE;
>   
>   	BUG_ON(check_hotplug_memory_range(start, size));
> @@ -2171,24 +2176,15 @@ static int __ref try_remove_memory(u64 start, u64 size)
>   	 * the same granularity it was added - a single memory block.
>   	 */
>   	if (mhp_memmap_on_memory()) {
> -		nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
> -						      get_nr_vmemmap_pages_cb);
> -		if (nr_vmemmap_pages) {
> +		ret = walk_memory_blocks(start, size, &mhp_altmap,
> +					 get_vmemmap_altmap_cb);
> +		if (ret) {
>   			if (size != memory_block_size_bytes()) {
>   				pr_warn("Refuse to remove %#llx - %#llx,"
>   					"wrong granularity\n",
>   					start, start + size);
>   				return -EINVAL;
>   			}
> -
> -			/*
> -			 * Let remove_pmd_table->free_hugepage_table do the
> -			 * right thing if we used vmem_altmap when hot-adding
> -			 * the range.
> -			 */
> -			mhp_altmap.base_pfn = PHYS_PFN(start);
> -			mhp_altmap.free = nr_vmemmap_pages;
> -			mhp_altmap.alloc = nr_vmemmap_pages;
>   			altmap = &mhp_altmap;
>   		}


Instead of that, I suggest (whitespace damage expected):

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 3f231cf1b410..f6860df64549 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1956,12 +1956,19 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
         return 0;
  }
  
-static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
+static int test_has_altmap_cb(struct memory_block *mem, void *arg)
  {
-       /*
-        * If not set, continue with the next block.
-        */
-       return mem->nr_vmemmap_pages;
+       struct memory_block **mem_ptr = (struct memory_block **)arg;
+
+       if (mem->altmap) {
+               /*
+                * We're not taking a reference on the memory block;
+                * it cannot vanish while we're about to remove that memory ourselves.
+                */
+               *mem_ptr = mem;
+               return 1;
+       }
+       return 0;
  }
  
  static int check_cpu_on_node(int nid)
@@ -2036,9 +2043,7 @@ EXPORT_SYMBOL(try_offline_node);
  
  static int __ref try_remove_memory(u64 start, u64 size)
  {
-       struct vmem_altmap mhp_altmap = {};
         struct vmem_altmap *altmap = NULL;
-       unsigned long nr_vmemmap_pages;
         int rc = 0, nid = NUMA_NO_NODE;
  
         BUG_ON(check_hotplug_memory_range(start, size));
@@ -2061,9 +2066,9 @@ static int __ref try_remove_memory(u64 start, u64 size)
          * the same granularity it was added - a single memory block.
          */
         if (mhp_memmap_on_memory()) {
-               nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
-                                                     get_nr_vmemmap_pages_cb);
-               if (nr_vmemmap_pages) {
+               struct memory_block *mem;
+
+               if (walk_memory_blocks(start, size, &mem, test_has_altmap_cb)) {
                         if (size != memory_block_size_bytes()) {
                                 pr_warn("Refuse to remove %#llx - %#llx,"
                                         "wrong granularity\n",
@@ -2072,12 +2077,11 @@ static int __ref try_remove_memory(u64 start, u64 size)
                         }
  
                         /*
-                        * Let remove_pmd_table->free_hugepage_table do the
-                        * right thing if we used vmem_altmap when hot-adding
-                        * the range.
+                        * Clear the altmap from the memory block before we
+                        * remove it; we'll take care of freeing the altmap.
                          */
-                       mhp_altmap.alloc = nr_vmemmap_pages;
-                       altmap = &mhp_altmap;
+                       altmap = mem->altmap;
+                       mem->altmap = NULL;
                 }
         }
  
@@ -2094,6 +2098,9 @@ static int __ref try_remove_memory(u64 start, u64 size)
  
         arch_remove_memory(start, size, altmap);
  
+       if (altmap)
+               kfree(altmap);
+
         if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
                 memblock_phys_free(start, size);
                 memblock_remove(start, size);
Aneesh Kumar K.V July 26, 2023, 10:31 a.m. UTC | #2
David Hildenbrand <david@redhat.com> writes:

> On 25.07.23 12:02, Aneesh Kumar K.V wrote:
>> With memmap on memory, some architectures need more details about the
>> altmap, such as base_pfn, end_pfn, etc., to unmap vmemmap memory.
>> Instead of computing them again when we remove a memory block, embed the
>> vmem_altmap details in struct memory_block if we are using the memmap on
>> memory feature.
>> 
>> No functional change in this patch.
>> 
>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>> ---
>
> [...]
>
>>   
>>   static int add_memory_block(unsigned long block_id, unsigned long state,
>> -			    unsigned long nr_vmemmap_pages,
>> +			    struct vmem_altmap *altmap,
>>   			    struct memory_group *group)
>>   {
>>   	struct memory_block *mem;
>> @@ -744,7 +751,14 @@ static int add_memory_block(unsigned long block_id, unsigned long state,
>>   	mem->start_section_nr = block_id * sections_per_block;
>>   	mem->state = state;
>>   	mem->nid = NUMA_NO_NODE;
>> -	mem->nr_vmemmap_pages = nr_vmemmap_pages;
>> +	if (altmap) {
>> +		mem->altmap = kmalloc(sizeof(struct vmem_altmap), GFP_KERNEL);
>> +		if (!mem->altmap) {
>> +			kfree(mem);
>> +			return -ENOMEM;
>> +		}
>> +		memcpy(mem->altmap, altmap, sizeof(*altmap));
>> +	}
>
> I'm wondering if we should instead let the caller do the alloc/free. So we would alloc
> in the caller and would only store the pointer.
>
> Before removing the memory block, we would clear the pointer and free it in the caller.
>
> IOW, when removing a memory block and we still have an altmap set, something would be wrong.
>
> See below on try_remove_memory() handling.
>
> [...]
>
>> -static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
>> +static int get_vmemmap_altmap_cb(struct memory_block *mem, void *arg)
>>   {
>> +	struct vmem_altmap *altmap = (struct vmem_altmap *)arg;
>>   	/*
>> -	 * If not set, continue with the next block.
>> +	 * If we have any pages allocated from the altmap,
>> +	 * return the altmap details and stop the walk.
>>   	 */
>> -	return mem->nr_vmemmap_pages;
>> +	if (mem->altmap) {
>> +		memcpy(altmap, mem->altmap, sizeof(struct vmem_altmap));
>> +		return 1;
>> +	}
>> +	return 0;
>>   }
>>   
>>   static int check_cpu_on_node(int nid)
>> @@ -2146,9 +2152,8 @@ EXPORT_SYMBOL(try_offline_node);
>>   
>>   static int __ref try_remove_memory(u64 start, u64 size)
>>   {
>> -	struct vmem_altmap mhp_altmap = {};
>> -	struct vmem_altmap *altmap = NULL;
>> -	unsigned long nr_vmemmap_pages;
>> +	int ret;
>> +	struct vmem_altmap mhp_altmap, *altmap = NULL;
>>   	int rc = 0, nid = NUMA_NO_NODE;
>>   
>>   	BUG_ON(check_hotplug_memory_range(start, size));
>> @@ -2171,24 +2176,15 @@ static int __ref try_remove_memory(u64 start, u64 size)
>>   	 * the same granularity it was added - a single memory block.
>>   	 */
>>   	if (mhp_memmap_on_memory()) {
>> -		nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
>> -						      get_nr_vmemmap_pages_cb);
>> -		if (nr_vmemmap_pages) {
>> +		ret = walk_memory_blocks(start, size, &mhp_altmap,
>> +					 get_vmemmap_altmap_cb);
>> +		if (ret) {
>>   			if (size != memory_block_size_bytes()) {
>>   				pr_warn("Refuse to remove %#llx - %#llx,"
>>   					"wrong granularity\n",
>>   					start, start + size);
>>   				return -EINVAL;
>>   			}
>> -
>> -			/*
>> -			 * Let remove_pmd_table->free_hugepage_table do the
>> -			 * right thing if we used vmem_altmap when hot-adding
>> -			 * the range.
>> -			 */
>> -			mhp_altmap.base_pfn = PHYS_PFN(start);
>> -			mhp_altmap.free = nr_vmemmap_pages;
>> -			mhp_altmap.alloc = nr_vmemmap_pages;
>>   			altmap = &mhp_altmap;
>>   		}
>
>
> Instead of that, I suggest (whitespace damage expected):
>
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 3f231cf1b410..f6860df64549 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1956,12 +1956,19 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
>          return 0;
>   }
>   
> -static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
> +static int test_has_altmap_cb(struct memory_block *mem, void *arg)
>   {
> -       /*
> -        * If not set, continue with the next block.
> -        */
> -       return mem->nr_vmemmap_pages;
> +       struct memory_block **mem_ptr = (struct memory_block **)arg;
> +
> +       if (mem->altmap) {
> +               /*
> +                * We're not taking a reference on the memory block;
> +                * it cannot vanish while we're about to remove that memory ourselves.
> +                */
> +               *mem_ptr = mem;
> +               return 1;
> +       }
> +       return 0;
>   }
>   
>   static int check_cpu_on_node(int nid)
> @@ -2036,9 +2043,7 @@ EXPORT_SYMBOL(try_offline_node);
>   
>   static int __ref try_remove_memory(u64 start, u64 size)
>   {
> -       struct vmem_altmap mhp_altmap = {};
>          struct vmem_altmap *altmap = NULL;
> -       unsigned long nr_vmemmap_pages;
>          int rc = 0, nid = NUMA_NO_NODE;
>   
>          BUG_ON(check_hotplug_memory_range(start, size));
> @@ -2061,9 +2066,9 @@ static int __ref try_remove_memory(u64 start, u64 size)
>           * the same granularity it was added - a single memory block.
>           */
>          if (mhp_memmap_on_memory()) {
> -               nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
> -                                                     get_nr_vmemmap_pages_cb);
> -               if (nr_vmemmap_pages) {
> +               struct memory_block *mem;
> +
> +               if (walk_memory_blocks(start, size, &mem, test_has_altmap_cb)) {
>                          if (size != memory_block_size_bytes()) {
>                                  pr_warn("Refuse to remove %#llx - %#llx,"
>                                          "wrong granularity\n",
> @@ -2072,12 +2077,11 @@ static int __ref try_remove_memory(u64 start, u64 size)
>                          }
>   
>                          /*
> -                        * Let remove_pmd_table->free_hugepage_table do the
> -                        * right thing if we used vmem_altmap when hot-adding
> -                        * the range.
> +                        * Clear the altmap from the memory block before we
> +                        * remove it; we'll take care of freeing the altmap.
>                           */
> -                       mhp_altmap.alloc = nr_vmemmap_pages;
> -                       altmap = &mhp_altmap;
> +                       altmap = mem->altmap;
> +                       mem->altmap = NULL;
>                  }
>          }
>   
> @@ -2094,6 +2098,9 @@ static int __ref try_remove_memory(u64 start, u64 size)
>   
>          arch_remove_memory(start, size, altmap);
>   
> +       if (altmap)
> +               kfree(altmap);
> +
>          if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
>                  memblock_phys_free(start, size);
>                  memblock_remove(start, size);
>

Is this any better? Any specific reason we want the alloc and free in
the caller?

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 0210ed7b7696..271cfdf8f6b6 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -106,7 +106,7 @@ static void memory_block_release(struct device *dev)
 {
 	struct memory_block *mem = to_memory_block(dev);
 
-	kfree(mem->altmap);
+	WARN_ON(mem->altmap);
 	kfree(mem);
 }
 
@@ -751,14 +751,8 @@ static int add_memory_block(unsigned long block_id, unsigned long state,
 	mem->start_section_nr = block_id * sections_per_block;
 	mem->state = state;
 	mem->nid = NUMA_NO_NODE;
-	if (altmap) {
-		mem->altmap = kmalloc(sizeof(struct vmem_altmap), GFP_KERNEL);
-		if (!mem->altmap) {
-			kfree(mem);
-			return -ENOMEM;
-		}
-		memcpy(mem->altmap, altmap, sizeof(*altmap));
-	}
+	if (altmap)
+		mem->altmap = altmap;
 	INIT_LIST_HEAD(&mem->group_next);
 
 #ifndef CONFIG_NUMA
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 2bad1bf0e9e3..1c7d88332e0e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1445,8 +1445,13 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 	 */
 	if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
 		if (mhp_supports_memmap_on_memory(size)) {
+
 			mhp_altmap.free = memory_block_memmap_on_memory_pages();
-			params.altmap = &mhp_altmap;
+			params.altmap = kzalloc(sizeof(struct vmem_altmap), GFP_KERNEL);
+			if (!params.altmap)
+				goto error;
+
+			memcpy(params.altmap, &mhp_altmap, sizeof(mhp_altmap));
 		}
 		/* fallback to not using altmap  */
 	}
@@ -2067,13 +2072,14 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
 
 static int get_vmemmap_altmap_cb(struct memory_block *mem, void *arg)
 {
-	struct vmem_altmap *altmap = (struct vmem_altmap *)arg;
+	struct vmem_altmap **altmap = (struct vmem_altmap **)arg;
 	/*
 	 * If we have any pages allocated from the altmap,
 	 * return the altmap details and stop the walk.
 	 */
 	if (mem->altmap) {
-		memcpy(altmap, mem->altmap, sizeof(struct vmem_altmap));
+		*altmap = mem->altmap;
+		mem->altmap = NULL;
 		return 1;
 	}
 	return 0;
@@ -2152,7 +2158,7 @@ EXPORT_SYMBOL(try_offline_node);
 static int __ref try_remove_memory(u64 start, u64 size)
 {
 	int ret;
-	struct vmem_altmap mhp_altmap, *altmap = NULL;
+	struct vmem_altmap *altmap = NULL;
 	int rc = 0, nid = NUMA_NO_NODE;
 
 	BUG_ON(check_hotplug_memory_range(start, size));
@@ -2174,7 +2180,7 @@ static int __ref try_remove_memory(u64 start, u64 size)
 	 * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
 	 * the same granularity it was added - a single memory block.
 	 */
-	ret = walk_memory_blocks(start, size, &mhp_altmap,
+	ret = walk_memory_blocks(start, size, &altmap,
 				 get_vmemmap_altmap_cb);
 	if (ret) {
 		if (size != memory_block_size_bytes()) {
@@ -2183,7 +2189,6 @@ static int __ref try_remove_memory(u64 start, u64 size)
 				start, start + size);
 			return -EINVAL;
 		}
-		altmap = &mhp_altmap;
 	}
 
 	/* remove memmap entry */
@@ -2203,8 +2208,10 @@ static int __ref try_remove_memory(u64 start, u64 size)
 	 * Now that we are tracking alloc and free correctly
 	 * we can add a check to verify altmap free pages.
 	 */
-	if (altmap)
+	if (altmap) {
 		WARN(altmap->alloc, "Altmap not fully unmapped");
+		kfree(altmap);
+	}
 
 	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
 		memblock_phys_free(start, size);
David Hildenbrand July 26, 2023, 4:43 p.m. UTC | #3
On 26.07.23 12:31, Aneesh Kumar K.V wrote:
> David Hildenbrand <david@redhat.com> writes:
> 
>> On 25.07.23 12:02, Aneesh Kumar K.V wrote:
>>> With memmap on memory, some architectures need more details about the
>>> altmap, such as base_pfn, end_pfn, etc., to unmap vmemmap memory.
>>> Instead of computing them again when we remove a memory block, embed the
>>> vmem_altmap details in struct memory_block if we are using the memmap on
>>> memory feature.
>>>
>>> No functional change in this patch.
>>>
>>> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
>>> ---
>>
>> [...]
>>
>>>    
>>>    static int add_memory_block(unsigned long block_id, unsigned long state,
>>> -			    unsigned long nr_vmemmap_pages,
>>> +			    struct vmem_altmap *altmap,
>>>    			    struct memory_group *group)
>>>    {
>>>    	struct memory_block *mem;
>>> @@ -744,7 +751,14 @@ static int add_memory_block(unsigned long block_id, unsigned long state,
>>>    	mem->start_section_nr = block_id * sections_per_block;
>>>    	mem->state = state;
>>>    	mem->nid = NUMA_NO_NODE;
>>> -	mem->nr_vmemmap_pages = nr_vmemmap_pages;
>>> +	if (altmap) {
>>> +		mem->altmap = kmalloc(sizeof(struct vmem_altmap), GFP_KERNEL);
>>> +		if (!mem->altmap) {
>>> +			kfree(mem);
>>> +			return -ENOMEM;
>>> +		}
>>> +		memcpy(mem->altmap, altmap, sizeof(*altmap));
>>> +	}
>>
>> I'm wondering if we should instead let the caller do the alloc/free. So we would alloc
>> in the caller and would only store the pointer.
>>
>> Before removing the memory block, we would clear the pointer and free it in the caller.
>>
>> IOW, when removing a memory block and we still have an altmap set, something would be wrong.
>>
>> See below on try_remove_memory() handling.
>>
>> [...]
>>
>>> -static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
>>> +static int get_vmemmap_altmap_cb(struct memory_block *mem, void *arg)
>>>    {
>>> +	struct vmem_altmap *altmap = (struct vmem_altmap *)arg;
>>>    	/*
>>> -	 * If not set, continue with the next block.
>>> +	 * If we have any pages allocated from the altmap,
>>> +	 * return the altmap details and stop the walk.
>>>    	 */
>>> -	return mem->nr_vmemmap_pages;
>>> +	if (mem->altmap) {
>>> +		memcpy(altmap, mem->altmap, sizeof(struct vmem_altmap));
>>> +		return 1;
>>> +	}
>>> +	return 0;
>>>    }
>>>    
>>>    static int check_cpu_on_node(int nid)
>>> @@ -2146,9 +2152,8 @@ EXPORT_SYMBOL(try_offline_node);
>>>    
>>>    static int __ref try_remove_memory(u64 start, u64 size)
>>>    {
>>> -	struct vmem_altmap mhp_altmap = {};
>>> -	struct vmem_altmap *altmap = NULL;
>>> -	unsigned long nr_vmemmap_pages;
>>> +	int ret;
>>> +	struct vmem_altmap mhp_altmap, *altmap = NULL;
>>>    	int rc = 0, nid = NUMA_NO_NODE;
>>>    
>>>    	BUG_ON(check_hotplug_memory_range(start, size));
>>> @@ -2171,24 +2176,15 @@ static int __ref try_remove_memory(u64 start, u64 size)
>>>    	 * the same granularity it was added - a single memory block.
>>>    	 */
>>>    	if (mhp_memmap_on_memory()) {
>>> -		nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
>>> -						      get_nr_vmemmap_pages_cb);
>>> -		if (nr_vmemmap_pages) {
>>> +		ret = walk_memory_blocks(start, size, &mhp_altmap,
>>> +					 get_vmemmap_altmap_cb);
>>> +		if (ret) {
>>>    			if (size != memory_block_size_bytes()) {
>>>    				pr_warn("Refuse to remove %#llx - %#llx,"
>>>    					"wrong granularity\n",
>>>    					start, start + size);
>>>    				return -EINVAL;
>>>    			}
>>> -
>>> -			/*
>>> -			 * Let remove_pmd_table->free_hugepage_table do the
>>> -			 * right thing if we used vmem_altmap when hot-adding
>>> -			 * the range.
>>> -			 */
>>> -			mhp_altmap.base_pfn = PHYS_PFN(start);
>>> -			mhp_altmap.free = nr_vmemmap_pages;
>>> -			mhp_altmap.alloc = nr_vmemmap_pages;
>>>    			altmap = &mhp_altmap;
>>>    		}
>>
>>
>> Instead of that, I suggest (whitespace damage expected):
>>
>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>> index 3f231cf1b410..f6860df64549 100644
>> --- a/mm/memory_hotplug.c
>> +++ b/mm/memory_hotplug.c
>> @@ -1956,12 +1956,19 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
>>           return 0;
>>    }
>>    
>> -static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
>> +static int test_has_altmap_cb(struct memory_block *mem, void *arg)
>>    {
>> -       /*
>> -        * If not set, continue with the next block.
>> -        */
>> -       return mem->nr_vmemmap_pages;
>> +       struct memory_block **mem_ptr = (struct memory_block **)arg;
>> +
>> +       if (mem->altmap) {
>> +               /*
>> +                * We're not taking a reference on the memory block;
>> +                * it cannot vanish while we're about to remove that memory ourselves.
>> +                */
>> +               *mem_ptr = mem;
>> +               return 1;
>> +       }
>> +       return 0;
>>    }
>>    
>>    static int check_cpu_on_node(int nid)
>> @@ -2036,9 +2043,7 @@ EXPORT_SYMBOL(try_offline_node);
>>    
>>    static int __ref try_remove_memory(u64 start, u64 size)
>>    {
>> -       struct vmem_altmap mhp_altmap = {};
>>           struct vmem_altmap *altmap = NULL;
>> -       unsigned long nr_vmemmap_pages;
>>           int rc = 0, nid = NUMA_NO_NODE;
>>    
>>           BUG_ON(check_hotplug_memory_range(start, size));
>> @@ -2061,9 +2066,9 @@ static int __ref try_remove_memory(u64 start, u64 size)
>>            * the same granularity it was added - a single memory block.
>>            */
>>           if (mhp_memmap_on_memory()) {
>> -               nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
>> -                                                     get_nr_vmemmap_pages_cb);
>> -               if (nr_vmemmap_pages) {
>> +               struct memory_block *mem;
>> +
>> +               if (walk_memory_blocks(start, size, &mem, test_has_altmap_cb)) {
>>                           if (size != memory_block_size_bytes()) {
>>                                   pr_warn("Refuse to remove %#llx - %#llx,"
>>                                           "wrong granularity\n",
>> @@ -2072,12 +2077,11 @@ static int __ref try_remove_memory(u64 start, u64 size)
>>                           }
>>    
>>                           /*
>> -                        * Let remove_pmd_table->free_hugepage_table do the
>> -                        * right thing if we used vmem_altmap when hot-adding
>> -                        * the range.
>> +                        * Clear the altmap from the memory block before we
>> +                        * remove it; we'll take care of freeing the altmap.
>>                            */
>> -                       mhp_altmap.alloc = nr_vmemmap_pages;
>> -                       altmap = &mhp_altmap;
>> +                       altmap = mem->altmap;
>> +                       mem->altmap = NULL;
>>                   }
>>           }
>>    
>> @@ -2094,6 +2098,9 @@ static int __ref try_remove_memory(u64 start, u64 size)
>>    
>>           arch_remove_memory(start, size, altmap);
>>    
>> +       if (altmap)
>> +               kfree(altmap);
>> +
>>           if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
>>                   memblock_phys_free(start, size);
>>                   memblock_remove(start, size);
>>
> 
> Is this any better? Any specific reason we want the alloc and free in
> the caller?


IMHO, if you only have a single instance, you don't have to worry about
any inconsistencies. At least to me, it looks cleaner than this copying
back and forth.

Below is better, but as you pointed out, your get_vmemmap_altmap_cb() 
change is problematic/insufficient.
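
(Presumably the insufficiency is one of ordering: in the version above,
the walk callback detaches the pointer as a side effect,

	*altmap = mem->altmap;
	mem->altmap = NULL;

before try_remove_memory() has performed its single-memory-block
granularity check, so on the -EINVAL path the block has already lost
its altmap and the allocation leaks.)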

Patch

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index b456ac213610..0210ed7b7696 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -106,6 +106,7 @@  static void memory_block_release(struct device *dev)
 {
 	struct memory_block *mem = to_memory_block(dev);
 
+	kfree(mem->altmap);
 	kfree(mem);
 }
 
@@ -183,7 +184,7 @@  static int memory_block_online(struct memory_block *mem)
 {
 	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
-	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+	unsigned long nr_vmemmap_pages = 0;
 	struct zone *zone;
 	int ret;
 
@@ -200,6 +201,9 @@  static int memory_block_online(struct memory_block *mem)
 	 * stage helps to keep accounting easier to follow - e.g vmemmaps
 	 * belong to the same zone as the memory they backed.
 	 */
+	if (mem->altmap)
+		nr_vmemmap_pages = mem->altmap->free;
+
 	if (nr_vmemmap_pages) {
 		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
 		if (ret)
@@ -230,7 +234,7 @@  static int memory_block_offline(struct memory_block *mem)
 {
 	unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
 	unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
-	unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+	unsigned long nr_vmemmap_pages = 0;
 	int ret;
 
 	if (!mem->zone)
@@ -240,6 +244,9 @@  static int memory_block_offline(struct memory_block *mem)
 	 * Unaccount before offlining, such that unpopulated zone and kthreads
 	 * can properly be torn down in offline_pages().
 	 */
+	if (mem->altmap)
+		nr_vmemmap_pages = mem->altmap->free;
+
 	if (nr_vmemmap_pages)
 		adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
 					  -nr_vmemmap_pages);
@@ -726,7 +733,7 @@  void memory_block_add_nid(struct memory_block *mem, int nid,
 #endif
 
 static int add_memory_block(unsigned long block_id, unsigned long state,
-			    unsigned long nr_vmemmap_pages,
+			    struct vmem_altmap *altmap,
 			    struct memory_group *group)
 {
 	struct memory_block *mem;
@@ -744,7 +751,14 @@  static int add_memory_block(unsigned long block_id, unsigned long state,
 	mem->start_section_nr = block_id * sections_per_block;
 	mem->state = state;
 	mem->nid = NUMA_NO_NODE;
-	mem->nr_vmemmap_pages = nr_vmemmap_pages;
+	if (altmap) {
+		mem->altmap = kmalloc(sizeof(struct vmem_altmap), GFP_KERNEL);
+		if (!mem->altmap) {
+			kfree(mem);
+			return -ENOMEM;
+		}
+		memcpy(mem->altmap, altmap, sizeof(*altmap));
+	}
 	INIT_LIST_HEAD(&mem->group_next);
 
 #ifndef CONFIG_NUMA
@@ -783,14 +797,14 @@  static int __init add_boot_memory_block(unsigned long base_section_nr)
 	if (section_count == 0)
 		return 0;
 	return add_memory_block(memory_block_id(base_section_nr),
-				MEM_ONLINE, 0,  NULL);
+				MEM_ONLINE, NULL,  NULL);
 }
 
 static int add_hotplug_memory_block(unsigned long block_id,
-				    unsigned long nr_vmemmap_pages,
+				    struct vmem_altmap *altmap,
 				    struct memory_group *group)
 {
-	return add_memory_block(block_id, MEM_OFFLINE, nr_vmemmap_pages, group);
+	return add_memory_block(block_id, MEM_OFFLINE, altmap, group);
 }
 
 static void remove_memory_block(struct memory_block *memory)
@@ -818,7 +832,7 @@  static void remove_memory_block(struct memory_block *memory)
  * Called under device_hotplug_lock.
  */
 int create_memory_block_devices(unsigned long start, unsigned long size,
-				unsigned long vmemmap_pages,
+				struct vmem_altmap *altmap,
 				struct memory_group *group)
 {
 	const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
@@ -832,7 +846,7 @@  int create_memory_block_devices(unsigned long start, unsigned long size,
 		return -EINVAL;
 
 	for (block_id = start_block_id; block_id != end_block_id; block_id++) {
-		ret = add_hotplug_memory_block(block_id, vmemmap_pages, group);
+		ret = add_hotplug_memory_block(block_id, altmap, group);
 		if (ret)
 			break;
 	}
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 31343566c221..f53cfdaaaa41 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -77,11 +77,7 @@  struct memory_block {
 	 */
 	struct zone *zone;
 	struct device dev;
-	/*
-	 * Number of vmemmap pages. These pages
-	 * lay at the beginning of the memory block.
-	 */
-	unsigned long nr_vmemmap_pages;
+	struct vmem_altmap *altmap;
 	struct memory_group *group;	/* group (if any) for this block */
 	struct list_head group_next;	/* next block inside memory group */
 #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG)
@@ -147,7 +143,7 @@  static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri)
 extern int register_memory_notifier(struct notifier_block *nb);
 extern void unregister_memory_notifier(struct notifier_block *nb);
 int create_memory_block_devices(unsigned long start, unsigned long size,
-				unsigned long vmemmap_pages,
+				struct vmem_altmap *altmap,
 				struct memory_group *group);
 void remove_memory_block_devices(unsigned long start, unsigned long size);
 extern void memory_dev_init(void);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 5b472e137898..96e794f39313 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1460,7 +1460,7 @@  int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
 		goto error;
 
 	/* create memory block devices after memory was added */
-	ret = create_memory_block_devices(start, size, mhp_altmap.free, group);
+	ret = create_memory_block_devices(start, size, params.altmap, group);
 	if (ret) {
 		arch_remove_memory(start, size, NULL);
 		goto error;
@@ -2066,12 +2066,18 @@  static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
 	return 0;
 }
 
-static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
+static int get_vmemmap_altmap_cb(struct memory_block *mem, void *arg)
 {
+	struct vmem_altmap *altmap = (struct vmem_altmap *)arg;
 	/*
-	 * If not set, continue with the next block.
+	 * If we have any pages allocated from the altmap,
+	 * return the altmap details and stop the walk.
 	 */
-	return mem->nr_vmemmap_pages;
+	if (mem->altmap) {
+		memcpy(altmap, mem->altmap, sizeof(struct vmem_altmap));
+		return 1;
+	}
+	return 0;
 }
 
 static int check_cpu_on_node(int nid)
@@ -2146,9 +2152,8 @@  EXPORT_SYMBOL(try_offline_node);
 
 static int __ref try_remove_memory(u64 start, u64 size)
 {
-	struct vmem_altmap mhp_altmap = {};
-	struct vmem_altmap *altmap = NULL;
-	unsigned long nr_vmemmap_pages;
+	int ret;
+	struct vmem_altmap mhp_altmap, *altmap = NULL;
 	int rc = 0, nid = NUMA_NO_NODE;
 
 	BUG_ON(check_hotplug_memory_range(start, size));
@@ -2171,24 +2176,15 @@  static int __ref try_remove_memory(u64 start, u64 size)
 	 * the same granularity it was added - a single memory block.
 	 */
 	if (mhp_memmap_on_memory()) {
-		nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
-						      get_nr_vmemmap_pages_cb);
-		if (nr_vmemmap_pages) {
+		ret = walk_memory_blocks(start, size, &mhp_altmap,
+					 get_vmemmap_altmap_cb);
+		if (ret) {
 			if (size != memory_block_size_bytes()) {
 				pr_warn("Refuse to remove %#llx - %#llx,"
 					"wrong granularity\n",
 					start, start + size);
 				return -EINVAL;
 			}
-
-			/*
-			 * Let remove_pmd_table->free_hugepage_table do the
-			 * right thing if we used vmem_altmap when hot-adding
-			 * the range.
-			 */
-			mhp_altmap.base_pfn = PHYS_PFN(start);
-			mhp_altmap.free = nr_vmemmap_pages;
-			mhp_altmap.alloc = nr_vmemmap_pages;
 			altmap = &mhp_altmap;
 		}
 	}
@@ -2206,6 +2202,13 @@  static int __ref try_remove_memory(u64 start, u64 size)
 
 	arch_remove_memory(start, size, altmap);
 
+	/*
+	 * Now that we are tracking alloc and free correctly
+	 * we can add a check to verify altmap free pages.
+	 */
+	if (altmap)
+		WARN(altmap->alloc, "Altmap not fully unmapped");
+
 	if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
 		memblock_phys_free(start, size);
 		memblock_remove(start, size);