diff mbox series

[2/2] mm, memory_hotplug: provide a more generic restrictions for memory hotplug

Message ID 20190404125916.10215-3-osalvador@suse.de (mailing list archive)
State New, archived
Headers show
Series Preparing memhotplug for allocating memmap from hot-added range | expand

Commit Message

Oscar Salvador April 4, 2019, 12:59 p.m. UTC
From: Michal Hocko <mhocko@suse.com>

arch_add_memory, __add_pages take a want_memblock which controls whether
the newly added memory should get the sysfs memblock user API (e.g.
ZONE_DEVICE users do not want/need this interface). Some callers even
want to control where we allocate the memmap from by configuring
altmap.

Add a more generic hotplug context for arch_add_memory and __add_pages.
struct mhp_restrictions contains flags which contain additional
features to be enabled by the memory hotplug (MHP_MEMBLOCK_API
currently) and altmap for alternative memmap allocator.

This patch shouldn't introduce any functional change.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Oscar Salvador <osalvador@suse.de>
---
 arch/arm64/mm/mmu.c            |  6 +++---
 arch/ia64/mm/init.c            |  6 +++---
 arch/powerpc/mm/mem.c          |  6 +++---
 arch/s390/mm/init.c            |  6 +++---
 arch/sh/mm/init.c              |  6 +++---
 arch/x86/mm/init_32.c          |  6 +++---
 arch/x86/mm/init_64.c          | 10 +++++-----
 include/linux/memory_hotplug.h | 29 +++++++++++++++++++++++------
 kernel/memremap.c              | 10 +++++++---
 mm/memory_hotplug.c            | 10 ++++++----
 10 files changed, 59 insertions(+), 36 deletions(-)

Comments

David Hildenbrand April 4, 2019, 2:57 p.m. UTC | #1
On 04.04.19 14:59, Oscar Salvador wrote:
> From: Michal Hocko <mhocko@suse.com>
> 
> arch_add_memory, __add_pages take a want_memblock which controls whether
> the newly added memory should get the sysfs memblock user API (e.g.
> ZONE_DEVICE users do not want/need this interface). Some callers even
> want to control where do we allocate the memmap from by configuring
> altmap.
> 
> Add a more generic hotplug context for arch_add_memory and __add_pages.
> struct mhp_restrictions contains flags which contains additional
> features to be enabled by the memory hotplug (MHP_MEMBLOCK_API
> currently) and altmap for alternative memmap allocator.
> 
> This patch shouldn't introduce any functional change.
> 
> Signed-off-by: Michal Hocko <mhocko@suse.com>
> Signed-off-by: Oscar Salvador <osalvador@suse.de>
> ---
>  arch/arm64/mm/mmu.c            |  6 +++---
>  arch/ia64/mm/init.c            |  6 +++---
>  arch/powerpc/mm/mem.c          |  6 +++---
>  arch/s390/mm/init.c            |  6 +++---
>  arch/sh/mm/init.c              |  6 +++---
>  arch/x86/mm/init_32.c          |  6 +++---
>  arch/x86/mm/init_64.c          | 10 +++++-----
>  include/linux/memory_hotplug.h | 29 +++++++++++++++++++++++------
>  kernel/memremap.c              | 10 +++++++---
>  mm/memory_hotplug.c            | 10 ++++++----
>  10 files changed, 59 insertions(+), 36 deletions(-)
> 
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index e6acfa7be4c7..db509550329d 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -1046,8 +1046,8 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
>  }
>  
>  #ifdef CONFIG_MEMORY_HOTPLUG
> -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
> -		    bool want_memblock)
> +int arch_add_memory(int nid, u64 start, u64 size,
> +			struct mhp_restrictions *restrictions)

Should the restrictions be marked const?

>  {
>  	int flags = 0;
>  
> @@ -1058,6 +1058,6 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
>  			     size, PAGE_KERNEL, pgd_pgtable_alloc, flags);
>  
>  	return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
> -			   altmap, want_memblock);
> +							restrictions);

Again, some strange alignment thingies going on here :)

>  }
>  #endif
> diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
> index e49200e31750..379eb1f9adc9 100644
> --- a/arch/ia64/mm/init.c
> +++ b/arch/ia64/mm/init.c
> @@ -666,14 +666,14 @@ mem_init (void)
>  }
>  
>  #ifdef CONFIG_MEMORY_HOTPLUG
> -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
> -		bool want_memblock)
> +int arch_add_memory(int nid, u64 start, u64 size,
> +			struct mhp_restrictions *restrictions)
>  {
>  	unsigned long start_pfn = start >> PAGE_SHIFT;
>  	unsigned long nr_pages = size >> PAGE_SHIFT;
>  	int ret;
>  
> -	ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
> +	ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
>  	if (ret)
>  		printk("%s: Problem encountered in __add_pages() as ret=%d\n",
>  		       __func__,  ret);
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index 1aa27aac73c5..76deaa8525db 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -109,8 +109,8 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end)
>  	return -ENODEV;
>  }
>  
> -int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
> -		bool want_memblock)
> +int __meminit arch_add_memory(int nid, u64 start, u64 size,
> +			struct mhp_restrictions *restrictions)
>  {
>  	unsigned long start_pfn = start >> PAGE_SHIFT;
>  	unsigned long nr_pages = size >> PAGE_SHIFT;
> @@ -127,7 +127,7 @@ int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *
>  	}
>  	flush_inval_dcache_range(start, start + size);
>  
> -	return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
> +	return __add_pages(nid, start_pfn, nr_pages, restrictions);
>  }
>  
>  #ifdef CONFIG_MEMORY_HOTREMOVE
> diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
> index 25e3113091ea..f5db961ad792 100644
> --- a/arch/s390/mm/init.c
> +++ b/arch/s390/mm/init.c
> @@ -216,8 +216,8 @@ device_initcall(s390_cma_mem_init);
>  
>  #endif /* CONFIG_CMA */
>  
> -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
> -		bool want_memblock)
> +int arch_add_memory(int nid, u64 start, u64 size,
> +		struct mhp_restrictions *restrictions)
>  {
>  	unsigned long start_pfn = PFN_DOWN(start);
>  	unsigned long size_pages = PFN_DOWN(size);
> @@ -227,7 +227,7 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
>  	if (rc)
>  		return rc;
>  
> -	rc = __add_pages(nid, start_pfn, size_pages, altmap, want_memblock);
> +	rc = __add_pages(nid, start_pfn, size_pages, restrictions);
>  	if (rc)
>  		vmem_remove_mapping(start, size);
>  	return rc;
> diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
> index 8e004b2f1a6a..168d3a6b9358 100644
> --- a/arch/sh/mm/init.c
> +++ b/arch/sh/mm/init.c
> @@ -404,15 +404,15 @@ void __init mem_init(void)
>  }
>  
>  #ifdef CONFIG_MEMORY_HOTPLUG
> -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
> -		bool want_memblock)
> +int arch_add_memory(int nid, u64 start, u64 size,
> +			struct mhp_restrictions *restrictions)
>  {
>  	unsigned long start_pfn = PFN_DOWN(start);
>  	unsigned long nr_pages = size >> PAGE_SHIFT;
>  	int ret;
>  
>  	/* We only have ZONE_NORMAL, so this is easy.. */
> -	ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
> +	ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
>  	if (unlikely(ret))
>  		printk("%s: Failed, __add_pages() == %d\n", __func__, ret);
>  
> diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
> index 85c94f9a87f8..755dbed85531 100644
> --- a/arch/x86/mm/init_32.c
> +++ b/arch/x86/mm/init_32.c
> @@ -850,13 +850,13 @@ void __init mem_init(void)
>  }
>  
>  #ifdef CONFIG_MEMORY_HOTPLUG
> -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
> -		bool want_memblock)
> +int arch_add_memory(int nid, u64 start, u64 size,
> +			struct mhp_restrictions *restrictions)
>  {
>  	unsigned long start_pfn = start >> PAGE_SHIFT;
>  	unsigned long nr_pages = size >> PAGE_SHIFT;
>  
> -	return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
> +	return __add_pages(nid, start_pfn, nr_pages, restrictions);
>  }
>  
>  #ifdef CONFIG_MEMORY_HOTREMOVE
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index bccff68e3267..db42c11b48fb 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -777,11 +777,11 @@ static void update_end_of_memory_vars(u64 start, u64 size)
>  }
>  
>  int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
> -		struct vmem_altmap *altmap, bool want_memblock)
> +				struct mhp_restrictions *restrictions)
>  {
>  	int ret;
>  
> -	ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
> +	ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
>  	WARN_ON_ONCE(ret);
>  
>  	/* update max_pfn, max_low_pfn and high_memory */
> @@ -791,15 +791,15 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
>  	return ret;
>  }
>  
> -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
> -		bool want_memblock)
> +int arch_add_memory(int nid, u64 start, u64 size,
> +			struct mhp_restrictions *restrictions)
>  {
>  	unsigned long start_pfn = start >> PAGE_SHIFT;
>  	unsigned long nr_pages = size >> PAGE_SHIFT;
>  
>  	init_memory_mapping(start, start + size);
>  
> -	return add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
> +	return add_pages(nid, start_pfn, nr_pages, restrictions);
>  }
>  
>  #define PAGE_INUSE 0xFD
> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> index 3c8cf347804c..5bd4b56f639c 100644
> --- a/include/linux/memory_hotplug.h
> +++ b/include/linux/memory_hotplug.h
> @@ -118,20 +118,37 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
>  	unsigned long nr_pages, struct vmem_altmap *altmap);
>  #endif /* CONFIG_MEMORY_HOTREMOVE */
>  
> +/*
> + * Do we want sysfs memblock files created. This will allow userspace to online
> + * and offline memory explicitly. Lack of this bit means that the caller has to
> + * call move_pfn_range_to_zone to finish the initialization.
> + */

I think you can be more precise here.

"Create memory block devices for added pages. This is usually the case
for all system ram (and only system ram), as only this way can memory be
onlined/offlined by user space and can kdump correctly detect the new
memory using udev events."

Maybe we should even go a step further and call this

MHP_SYSTEM_RAM

Because that is what it is right now.

> +
> +#define MHP_MEMBLOCK_API               (1<<0)
> +
> +/*
> + * Restrictions for the memory hotplug:
> + * flags:  MHP_ flags
> + * altmap: alternative allocator for memmap array
> + */
> +struct mhp_restrictions {
> +	unsigned long flags;
> +	struct vmem_altmap *altmap;
> +};
> +
>  /* reasonably generic interface to expand the physical pages */
>  extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
> -		struct vmem_altmap *altmap, bool want_memblock);
> +					struct mhp_restrictions *restrictions);
>  
>  #ifndef CONFIG_ARCH_HAS_ADD_PAGES
>  static inline int add_pages(int nid, unsigned long start_pfn,
> -		unsigned long nr_pages, struct vmem_altmap *altmap,
> -		bool want_memblock)
> +		unsigned long nr_pages, struct mhp_restrictions *restrictions)
>  {
> -	return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
> +	return __add_pages(nid, start_pfn, nr_pages, restrictions);
>  }
>  #else /* ARCH_HAS_ADD_PAGES */
>  int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
> -		struct vmem_altmap *altmap, bool want_memblock);
> +				struct mhp_restrictions *restrictions);

ditto alignment. You have tabs configured to 8 characters, right?

>  #endif /* ARCH_HAS_ADD_PAGES */
>  
>  #ifdef CONFIG_NUMA
> @@ -333,7 +350,7 @@ extern int __add_memory(int nid, u64 start, u64 size);
>  extern int add_memory(int nid, u64 start, u64 size);
>  extern int add_memory_resource(int nid, struct resource *resource);
>  extern int arch_add_memory(int nid, u64 start, u64 size,
> -		struct vmem_altmap *altmap, bool want_memblock);
> +			struct mhp_restrictions *restrictions);
>  extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
>  		unsigned long nr_pages, struct vmem_altmap *altmap);
>  extern bool is_memblock_offlined(struct memory_block *mem);
> diff --git a/kernel/memremap.c b/kernel/memremap.c
> index a856cb5ff192..cc5e3e34417d 100644
> --- a/kernel/memremap.c
> +++ b/kernel/memremap.c
> @@ -149,6 +149,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
>  	struct resource *res = &pgmap->res;
>  	struct dev_pagemap *conflict_pgmap;
>  	pgprot_t pgprot = PAGE_KERNEL;
> +	struct mhp_restrictions restrictions = {};
>  	int error, nid, is_ram;
>  
>  	if (!pgmap->ref || !pgmap->kill)
> @@ -199,6 +200,9 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
>  	if (error)
>  		goto err_pfn_remap;
>  
> +	/* We do not want any optional features only our own memmap */
> +	restrictions.altmap = altmap;
> +
>  	mem_hotplug_begin();
>  
>  	/*
> @@ -214,7 +218,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
>  	 */
>  	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
>  		error = add_pages(nid, align_start >> PAGE_SHIFT,
> -				align_size >> PAGE_SHIFT, NULL, false);
> +				align_size >> PAGE_SHIFT, &restrictions);
>  	} else {
>  		error = kasan_add_zero_shadow(__va(align_start), align_size);
>  		if (error) {
> @@ -222,8 +226,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
>  			goto err_kasan;
>  		}
>  
> -		error = arch_add_memory(nid, align_start, align_size, altmap,
> -				false);
> +		error = arch_add_memory(nid, align_start, align_size,
> +							&restrictions);

ditto alignment

>  	}
>  
>  	if (!error) {
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index d8a3e9554aec..50f77e059457 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -274,12 +274,12 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
>   * add the new pages.
>   */
>  int __ref __add_pages(int nid, unsigned long phys_start_pfn,
> -		unsigned long nr_pages, struct vmem_altmap *altmap,
> -		bool want_memblock)
> +		unsigned long nr_pages, struct mhp_restrictions *restrictions)
>  {
>  	unsigned long i;
>  	int err = 0;
>  	int start_sec, end_sec;
> +	struct vmem_altmap *altmap = restrictions->altmap;
>  
>  	/* during initialize mem_map, align hot-added range to section */
>  	start_sec = pfn_to_section_nr(phys_start_pfn);
> @@ -300,7 +300,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
>  
>  	for (i = start_sec; i <= end_sec; i++) {
>  		err = __add_section(nid, section_nr_to_pfn(i), altmap,
> -				want_memblock);
> +				restrictions->flags & MHP_MEMBLOCK_API);
>  
>  		/*
>  		 * EEXIST is finally dealt with by ioresource collision
> @@ -1102,6 +1102,7 @@ int __ref add_memory_resource(int nid, struct resource *res)
>  	u64 start, size;
>  	bool new_node = false;
>  	int ret;
> +	struct mhp_restrictions restrictions = {};

I'd make this the very first variable.

Also eventually

struct mhp_restrictions restrictions = {
	.flags = MHP_MEMBLOCK_API
};

>  
>  	start = res->start;
>  	size = resource_size(res);
> @@ -1126,7 +1127,8 @@ int __ref add_memory_resource(int nid, struct resource *res)
>  	new_node = ret;
>  
>  	/* call arch's memory hotadd */
> -	ret = arch_add_memory(nid, start, size, NULL, true);
> +	restrictions.flags = MHP_MEMBLOCK_API;
> +	ret = arch_add_memory(nid, start, size, &restrictions);
>  	if (ret < 0)
>  		goto error;
>  
>
David Hildenbrand April 4, 2019, 3:02 p.m. UTC | #2
On 04.04.19 16:57, David Hildenbrand wrote:
> On 04.04.19 14:59, Oscar Salvador wrote:
>> From: Michal Hocko <mhocko@suse.com>
>>
>> arch_add_memory, __add_pages take a want_memblock which controls whether
>> the newly added memory should get the sysfs memblock user API (e.g.
>> ZONE_DEVICE users do not want/need this interface). Some callers even
>> want to control where do we allocate the memmap from by configuring
>> altmap.
>>
>> Add a more generic hotplug context for arch_add_memory and __add_pages.
>> struct mhp_restrictions contains flags which contains additional
>> features to be enabled by the memory hotplug (MHP_MEMBLOCK_API
>> currently) and altmap for alternative memmap allocator.
>>
>> This patch shouldn't introduce any functional change.
>>
>> Signed-off-by: Michal Hocko <mhocko@suse.com>
>> Signed-off-by: Oscar Salvador <osalvador@suse.de>
>> ---
>>  arch/arm64/mm/mmu.c            |  6 +++---
>>  arch/ia64/mm/init.c            |  6 +++---
>>  arch/powerpc/mm/mem.c          |  6 +++---
>>  arch/s390/mm/init.c            |  6 +++---
>>  arch/sh/mm/init.c              |  6 +++---
>>  arch/x86/mm/init_32.c          |  6 +++---
>>  arch/x86/mm/init_64.c          | 10 +++++-----
>>  include/linux/memory_hotplug.h | 29 +++++++++++++++++++++++------
>>  kernel/memremap.c              | 10 +++++++---
>>  mm/memory_hotplug.c            | 10 ++++++----
>>  10 files changed, 59 insertions(+), 36 deletions(-)
>>
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index e6acfa7be4c7..db509550329d 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -1046,8 +1046,8 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
>>  }
>>  
>>  #ifdef CONFIG_MEMORY_HOTPLUG
>> -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
>> -		    bool want_memblock)
>> +int arch_add_memory(int nid, u64 start, u64 size,
>> +			struct mhp_restrictions *restrictions)
> 
> Should the restrictions be marked const?
> 
>>  {
>>  	int flags = 0;
>>  
>> @@ -1058,6 +1058,6 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
>>  			     size, PAGE_KERNEL, pgd_pgtable_alloc, flags);
>>  
>>  	return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
>> -			   altmap, want_memblock);
>> +							restrictions);
> 
> Again, some strange alignment thingies going on here :)
> 
>>  }
>>  #endif
>> diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
>> index e49200e31750..379eb1f9adc9 100644
>> --- a/arch/ia64/mm/init.c
>> +++ b/arch/ia64/mm/init.c
>> @@ -666,14 +666,14 @@ mem_init (void)
>>  }
>>  
>>  #ifdef CONFIG_MEMORY_HOTPLUG
>> -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
>> -		bool want_memblock)
>> +int arch_add_memory(int nid, u64 start, u64 size,
>> +			struct mhp_restrictions *restrictions)
>>  {
>>  	unsigned long start_pfn = start >> PAGE_SHIFT;
>>  	unsigned long nr_pages = size >> PAGE_SHIFT;
>>  	int ret;
>>  
>> -	ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
>> +	ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
>>  	if (ret)
>>  		printk("%s: Problem encountered in __add_pages() as ret=%d\n",
>>  		       __func__,  ret);
>> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
>> index 1aa27aac73c5..76deaa8525db 100644
>> --- a/arch/powerpc/mm/mem.c
>> +++ b/arch/powerpc/mm/mem.c
>> @@ -109,8 +109,8 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end)
>>  	return -ENODEV;
>>  }
>>  
>> -int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
>> -		bool want_memblock)
>> +int __meminit arch_add_memory(int nid, u64 start, u64 size,
>> +			struct mhp_restrictions *restrictions)
>>  {
>>  	unsigned long start_pfn = start >> PAGE_SHIFT;
>>  	unsigned long nr_pages = size >> PAGE_SHIFT;
>> @@ -127,7 +127,7 @@ int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *
>>  	}
>>  	flush_inval_dcache_range(start, start + size);
>>  
>> -	return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
>> +	return __add_pages(nid, start_pfn, nr_pages, restrictions);
>>  }
>>  
>>  #ifdef CONFIG_MEMORY_HOTREMOVE
>> diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
>> index 25e3113091ea..f5db961ad792 100644
>> --- a/arch/s390/mm/init.c
>> +++ b/arch/s390/mm/init.c
>> @@ -216,8 +216,8 @@ device_initcall(s390_cma_mem_init);
>>  
>>  #endif /* CONFIG_CMA */
>>  
>> -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
>> -		bool want_memblock)
>> +int arch_add_memory(int nid, u64 start, u64 size,
>> +		struct mhp_restrictions *restrictions)
>>  {
>>  	unsigned long start_pfn = PFN_DOWN(start);
>>  	unsigned long size_pages = PFN_DOWN(size);
>> @@ -227,7 +227,7 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
>>  	if (rc)
>>  		return rc;
>>  
>> -	rc = __add_pages(nid, start_pfn, size_pages, altmap, want_memblock);
>> +	rc = __add_pages(nid, start_pfn, size_pages, restrictions);
>>  	if (rc)
>>  		vmem_remove_mapping(start, size);
>>  	return rc;
>> diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
>> index 8e004b2f1a6a..168d3a6b9358 100644
>> --- a/arch/sh/mm/init.c
>> +++ b/arch/sh/mm/init.c
>> @@ -404,15 +404,15 @@ void __init mem_init(void)
>>  }
>>  
>>  #ifdef CONFIG_MEMORY_HOTPLUG
>> -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
>> -		bool want_memblock)
>> +int arch_add_memory(int nid, u64 start, u64 size,
>> +			struct mhp_restrictions *restrictions)
>>  {
>>  	unsigned long start_pfn = PFN_DOWN(start);
>>  	unsigned long nr_pages = size >> PAGE_SHIFT;
>>  	int ret;
>>  
>>  	/* We only have ZONE_NORMAL, so this is easy.. */
>> -	ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
>> +	ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
>>  	if (unlikely(ret))
>>  		printk("%s: Failed, __add_pages() == %d\n", __func__, ret);
>>  
>> diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
>> index 85c94f9a87f8..755dbed85531 100644
>> --- a/arch/x86/mm/init_32.c
>> +++ b/arch/x86/mm/init_32.c
>> @@ -850,13 +850,13 @@ void __init mem_init(void)
>>  }
>>  
>>  #ifdef CONFIG_MEMORY_HOTPLUG
>> -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
>> -		bool want_memblock)
>> +int arch_add_memory(int nid, u64 start, u64 size,
>> +			struct mhp_restrictions *restrictions)
>>  {
>>  	unsigned long start_pfn = start >> PAGE_SHIFT;
>>  	unsigned long nr_pages = size >> PAGE_SHIFT;
>>  
>> -	return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
>> +	return __add_pages(nid, start_pfn, nr_pages, restrictions);
>>  }
>>  
>>  #ifdef CONFIG_MEMORY_HOTREMOVE
>> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
>> index bccff68e3267..db42c11b48fb 100644
>> --- a/arch/x86/mm/init_64.c
>> +++ b/arch/x86/mm/init_64.c
>> @@ -777,11 +777,11 @@ static void update_end_of_memory_vars(u64 start, u64 size)
>>  }
>>  
>>  int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
>> -		struct vmem_altmap *altmap, bool want_memblock)
>> +				struct mhp_restrictions *restrictions)
>>  {
>>  	int ret;
>>  
>> -	ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
>> +	ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
>>  	WARN_ON_ONCE(ret);
>>  
>>  	/* update max_pfn, max_low_pfn and high_memory */
>> @@ -791,15 +791,15 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
>>  	return ret;
>>  }
>>  
>> -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
>> -		bool want_memblock)
>> +int arch_add_memory(int nid, u64 start, u64 size,
>> +			struct mhp_restrictions *restrictions)
>>  {
>>  	unsigned long start_pfn = start >> PAGE_SHIFT;
>>  	unsigned long nr_pages = size >> PAGE_SHIFT;
>>  
>>  	init_memory_mapping(start, start + size);
>>  
>> -	return add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
>> +	return add_pages(nid, start_pfn, nr_pages, restrictions);
>>  }
>>  
>>  #define PAGE_INUSE 0xFD
>> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
>> index 3c8cf347804c..5bd4b56f639c 100644
>> --- a/include/linux/memory_hotplug.h
>> +++ b/include/linux/memory_hotplug.h
>> @@ -118,20 +118,37 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
>>  	unsigned long nr_pages, struct vmem_altmap *altmap);
>>  #endif /* CONFIG_MEMORY_HOTREMOVE */
>>  
>> +/*
>> + * Do we want sysfs memblock files created. This will allow userspace to online
>> + * and offline memory explicitly. Lack of this bit means that the caller has to
>> + * call move_pfn_range_to_zone to finish the initialization.
>> + */
> 
> I think you can be more precise here.
> 
> "Create memory block devices for added pages. This is usually the case
> for all system ram (and only system ram), as only this way memory can be
> onlined/offlined by user space and kdump to correctly detect the new
> memory using udev events."
> 
> Maybe we should even go a step further and call this
> 
> MHP_SYSTEM_RAM
> 
> Because that is what it is right now.
> 
>> +
>> +#define MHP_MEMBLOCK_API               (1<<0)
>> +
>> +/*
>> + * Restrictions for the memory hotplug:
>> + * flags:  MHP_ flags
>> + * altmap: alternative allocator for memmap array
>> + */
>> +struct mhp_restrictions {
>> +	unsigned long flags;
>> +	struct vmem_altmap *altmap;
>> +};
>> +
>>  /* reasonably generic interface to expand the physical pages */
>>  extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
>> -		struct vmem_altmap *altmap, bool want_memblock);
>> +					struct mhp_restrictions *restrictions);
>>  
>>  #ifndef CONFIG_ARCH_HAS_ADD_PAGES
>>  static inline int add_pages(int nid, unsigned long start_pfn,
>> -		unsigned long nr_pages, struct vmem_altmap *altmap,
>> -		bool want_memblock)
>> +		unsigned long nr_pages, struct mhp_restrictions *restrictions)
>>  {
>> -	return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
>> +	return __add_pages(nid, start_pfn, nr_pages, restrictions);
>>  }
>>  #else /* ARCH_HAS_ADD_PAGES */
>>  int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
>> -		struct vmem_altmap *altmap, bool want_memblock);
>> +				struct mhp_restrictions *restrictions);
> 
> dito alignment. You have tabs configured to 8 characters, right?
> 
>>  #endif /* ARCH_HAS_ADD_PAGES */
>>  
>>  #ifdef CONFIG_NUMA
>> @@ -333,7 +350,7 @@ extern int __add_memory(int nid, u64 start, u64 size);
>>  extern int add_memory(int nid, u64 start, u64 size);
>>  extern int add_memory_resource(int nid, struct resource *resource);
>>  extern int arch_add_memory(int nid, u64 start, u64 size,
>> -		struct vmem_altmap *altmap, bool want_memblock);
>> +			struct mhp_restrictions *restrictions);
>>  extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
>>  		unsigned long nr_pages, struct vmem_altmap *altmap);
>>  extern bool is_memblock_offlined(struct memory_block *mem);
>> diff --git a/kernel/memremap.c b/kernel/memremap.c
>> index a856cb5ff192..cc5e3e34417d 100644
>> --- a/kernel/memremap.c
>> +++ b/kernel/memremap.c
>> @@ -149,6 +149,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
>>  	struct resource *res = &pgmap->res;
>>  	struct dev_pagemap *conflict_pgmap;
>>  	pgprot_t pgprot = PAGE_KERNEL;
>> +	struct mhp_restrictions restrictions = {};
>>  	int error, nid, is_ram;
>>  
>>  	if (!pgmap->ref || !pgmap->kill)
>> @@ -199,6 +200,9 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
>>  	if (error)
>>  		goto err_pfn_remap;
>>  
>> +	/* We do not want any optional features only our own memmap */
>> +	restrictions.altmap = altmap;
>> +
>>  	mem_hotplug_begin();
>>  
>>  	/*
>> @@ -214,7 +218,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
>>  	 */
>>  	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
>>  		error = add_pages(nid, align_start >> PAGE_SHIFT,
>> -				align_size >> PAGE_SHIFT, NULL, false);
>> +				align_size >> PAGE_SHIFT, &restrictions);
>>  	} else {
>>  		error = kasan_add_zero_shadow(__va(align_start), align_size);
>>  		if (error) {
>> @@ -222,8 +226,8 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
>>  			goto err_kasan;
>>  		}
>>  
>> -		error = arch_add_memory(nid, align_start, align_size, altmap,
>> -				false);
>> +		error = arch_add_memory(nid, align_start, align_size,
>> +							&restrictions);
> 
> dito alignment
> 
>>  	}
>>  
>>  	if (!error) {
>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>> index d8a3e9554aec..50f77e059457 100644
>> --- a/mm/memory_hotplug.c
>> +++ b/mm/memory_hotplug.c
>> @@ -274,12 +274,12 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
>>   * add the new pages.
>>   */
>>  int __ref __add_pages(int nid, unsigned long phys_start_pfn,
>> -		unsigned long nr_pages, struct vmem_altmap *altmap,
>> -		bool want_memblock)
>> +		unsigned long nr_pages, struct mhp_restrictions *restrictions)
>>  {
>>  	unsigned long i;
>>  	int err = 0;
>>  	int start_sec, end_sec;
>> +	struct vmem_altmap *altmap = restrictions->altmap;
>>  
>>  	/* during initialize mem_map, align hot-added range to section */
>>  	start_sec = pfn_to_section_nr(phys_start_pfn);
>> @@ -300,7 +300,7 @@ int __ref __add_pages(int nid, unsigned long phys_start_pfn,
>>  
>>  	for (i = start_sec; i <= end_sec; i++) {
>>  		err = __add_section(nid, section_nr_to_pfn(i), altmap,
>> -				want_memblock);
>> +				restrictions->flags & MHP_MEMBLOCK_API);
>>  
>>  		/*
>>  		 * EEXIST is finally dealt with by ioresource collision
>> @@ -1102,6 +1102,7 @@ int __ref add_memory_resource(int nid, struct resource *res)
>>  	u64 start, size;
>>  	bool new_node = false;
>>  	int ret;
>> +	struct mhp_restrictions restrictions = {};
> 
> I'd make this the very first variable.
> 
> Also eventually
> 
> struct mhp_restrictions restrictions = {
> 	.flags = MHP_MEMBLOCK_API
> };
> 
>>  
>>  	start = res->start;
>>  	size = resource_size(res);
>> @@ -1126,7 +1127,8 @@ int __ref add_memory_resource(int nid, struct resource *res)
>>  	new_node = ret;
>>  
>>  	/* call arch's memory hotadd */
>> -	ret = arch_add_memory(nid, start, size, NULL, true);
>> +	restrictions.flags = MHP_MEMBLOCK_API;
>> +	ret = arch_add_memory(nid, start, size, &restrictions);
>>  	if (ret < 0)
>>  		goto error;
>>  
>>
> 
> 

s/alignment/indentation/

I think I should take a nap :)
Oscar Salvador April 4, 2019, 6:01 p.m. UTC | #3
On Thu, Apr 04, 2019 at 04:57:03PM +0200, David Hildenbrand wrote:

> >  #ifdef CONFIG_MEMORY_HOTPLUG
> > -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
> > -		    bool want_memblock)
> > +int arch_add_memory(int nid, u64 start, u64 size,
> > +			struct mhp_restrictions *restrictions)
> 
> Should the restrictions be marked const?

We could, but maybe some platforms want to override something later on
depending on x or y configurations, so we could be more flexible here.

> > +/*
> > + * Do we want sysfs memblock files created. This will allow userspace to online
> > + * and offline memory explicitly. Lack of this bit means that the caller has to
> > + * call move_pfn_range_to_zone to finish the initialization.
> > + */
> 
> I think you can be more precise here.
> 
> "Create memory block devices for added pages. This is usually the case
> for all system ram (and only system ram), as only this way memory can be
> onlined/offlined by user space and kdump to correctly detect the new
> memory using udev events."
> 
> Maybe we should even go a step further and call this
> 
> MHP_SYSTEM_RAM
> 
> Because that is what it is right now.

I agree that that is a nicer explanation, and I would not mind adding it.
In the end, the more information and commented code the better.

But I am not really convinced by the MHP_SYSTEM_RAM name, and I think we should stick
with MHP_MEMBLOCK_API because it represents __what__ that flag is about and its
function, e.g. creating memory block devices.

> > @@ -1102,6 +1102,7 @@ int __ref add_memory_resource(int nid, struct resource *res)
> >  	u64 start, size;
> >  	bool new_node = false;
> >  	int ret;
> > +	struct mhp_restrictions restrictions = {};
> 
> I'd make this the very first variable.
> 
> Also eventually
> 
> struct mhp_restrictions restrictions = {
> 	.restrictions = MHP_MEMBLOCK_API
> };

It might be more right.
Actually, that is the way we tend to pre-initialize fields in structs.

About the indentation, I am really puzzled, I checked my branch and I
cannot see any space that should be a tab.
Maybe it got screwed up when sending it.

Anyway, thanks for the feedback David ;-)
David Hildenbrand April 4, 2019, 6:27 p.m. UTC | #4
On 04.04.19 20:01, Oscar Salvador wrote:
> On Thu, Apr 04, 2019 at 04:57:03PM +0200, David Hildenbrand wrote:
> 
>>>  #ifdef CONFIG_MEMORY_HOTPLUG
>>> -int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
>>> -		    bool want_memblock)
>>> +int arch_add_memory(int nid, u64 start, u64 size,
>>> +			struct mhp_restrictions *restrictions)
>>
>> Should the restrictions be marked const?
> 
> We could, but maybe some platforms want to override something later on
> depending on x or y configurations, so we could be more flexible here.
> 
>>> +/*
>>> + * Do we want sysfs memblock files created. This will allow userspace to online
>>> + * and offline memory explicitly. Lack of this bit means that the caller has to
>>> + * call move_pfn_range_to_zone to finish the initialization.
>>> + */
>>
>> I think you can be more precise here.
>>
>> "Create memory block devices for added pages. This is usually the case
>> for all system ram (and only system ram), as only this way memory can be
>> onlined/offlined by user space and kdump to correctly detect the new
>> memory using udev events."
>>
>> Maybe we should even go a step further and call this
>>
>> MHP_SYSTEM_RAM
>>
>> Because that is what it is right now.
> 
> I agree that that is nicer explanation, and I would not mind to add it.
> In the end, the more information and commented code the better.
> 
> But I am not really convinced by MHP_SYSTEM_RAM name, and I think we should stick
> with MHP_MEMBLOCK_API because it represents __what__ is that flag about and its
> function, e.g: create memory block devices.

This nicely aligns with the sub-section memory add support discussion.

MHP_MEMBLOCK_API immediately implies that

- memory is used as system ram. Memory can be onlined/offlined. Markers
  at sections indicate if the section is online/offline.
- memory has to follow certain restrictions (alignment + size multiple
  of memory block size)

IOW, if some ZONE_DEVICE memory would set this flag, very bad things
will happen. Especially, device memory with memory block devices should
also result in quite some issues (I remember it is checked somewhere).

System ram added without MHP_MEMBLOCK_API? What would happen? Memory can
never be onlined.

I feel like mixing these two interfaces - adding system memory vs.
adding device memory - wasn't the best design choice. Lots of parameters to
set, but some parameters are actually mutually exclusive. Especially
memory block devices are a difference.

Maybe having actual function call variants like

__add_system_memory / arch_add_system_memory ...

and

__add_device_memory / arch_add_device_memory

would make things clearer. To me, it feels like we are trying to squeeze
too many different things into one function call path, allowing people
to do things that shouldn't be done.

Any opinions on the design/direction on these interfaces in general? I
don't see us moving away from memory block devices for system ram. But I
am seeing us moving towards sub-section hot-add support for anything not
system ram.

> 
>>> @@ -1102,6 +1102,7 @@ int __ref add_memory_resource(int nid, struct resource *res)
>>>  	u64 start, size;
>>>  	bool new_node = false;
>>>  	int ret;
>>> +	struct mhp_restrictions restrictions = {};
>>
>> I'd make this the very first variable.
>>
>> Also eventually
>>
>> struct mhp_restrictions restrictions = {
>> 	.restrictions = MHP_MEMBLOCK_API
>> };
> 
> It might be more right.
> Actually, that is the way we tend to pre-initialize fields in structs.
> 
> About the identation, I  am really puzzled, I checked my branch and I
> cannot see any space that should be a tab.
> Maybe it got screwed up when sending it.

It's not about spaces that should be tabs, rather about how many tabs to
use. But this is really just nit picking, because it usually directly
jumps into my eyes :) On vim with tabstop=8 it didn't look right.

> 
> Anyway, thanks for the feedback David ;-)
>
Michal Hocko April 5, 2019, 7:14 a.m. UTC | #5
On Thu 04-04-19 20:27:41, David Hildenbrand wrote:
> On 04.04.19 20:01, Oscar Salvador wrote:
[...]
> > But I am not really convinced by MHP_SYSTEM_RAM name, and I think we should stick
> > with MHP_MEMBLOCK_API because it represents __what__ is that flag about and its
> > function, e.g: create memory block devices.

Exactly

> This nicely aligns with the sub-section memory add support discussion.
> 
> MHP_MEMBLOCK_API immediately implies that
> 
> - memory is used as system ram. Memory can be onlined/offlined. Markers
>   at sections indicate if the section is online/offline.

No there is no implication like that. It means only that the onlined
memory has a sysfs interface. Nothing more, nothing less

This is an internal API so we are not carving anything into the stone.
So can we simply start with what we have and go from there? I am getting
the feeling that this discussion just makes the whole thing more muddy.
David Hildenbrand April 5, 2019, 8:05 a.m. UTC | #6
On 05.04.19 09:14, Michal Hocko wrote:
> On Thu 04-04-19 20:27:41, David Hildenbrand wrote:
>> On 04.04.19 20:01, Oscar Salvador wrote:
> [...]
>>> But I am not really convinced by MHP_SYSTEM_RAM name, and I think we should stick
>>> with MHP_MEMBLOCK_API because it represents __what__ is that flag about and its
>>> function, e.g: create memory block devices.
> 
> Exactly

Fine with me for keeping what Oscar has.

> 
>> This nicely aligns with the sub-section memory add support discussion.
>>
>> MHP_MEMBLOCK_API immediately implies that
>>
>> - memory is used as system ram. Memory can be onlined/offlined. Markers
>>   at sections indicate if the section is online/offline.
> 
> No there is no implication like that. It means only that the onlined
> memory has a sysfs interface. Nothing more, nothing less

As soon as there is a online/offline interface, you *can* (and user
space usually *will*) online that memory. Onlining/offlining is only
defined for memory to be added to the buddy - memory to be used as
"system ram". Doing it for random device memory will not work / result
in undefined behavior.

Not adding memory block devices for system ram will not allow user space
to online/offline it and break kdump reload for hot-added memory. But
memory can be onlined/offlined using internal APIs of course - if that's
what you were referring to.

> 
> This is an internal API so we are not carving anything into the stone.

That is true.

> So can we simply start with what we have and go from there?

Sure, what Oscar does here is just a simple refactoring of the interface
and I was just wondering if the interface needs a general overhaul.

>I am getting
> felling that this discussion just makes the whole thing more muddy.

I think this discussion is helpful to understand how the whole thing is
supposed to work :) At least on my side.


Meanwhile, I will have a look if memory block devices cannot simply be
created by the caller of arch_add_memory(). At least it feels like
creating memory block devices could be factored out - I remember that it
was not that easy.
Michal Hocko April 5, 2019, 10:30 a.m. UTC | #7
On Fri 05-04-19 10:05:09, David Hildenbrand wrote:
> On 05.04.19 09:14, Michal Hocko wrote:
> > On Thu 04-04-19 20:27:41, David Hildenbrand wrote:
> >> On 04.04.19 20:01, Oscar Salvador wrote:
> > [...]
> >>> But I am not really convinced by MHP_SYSTEM_RAM name, and I think we should stick
> >>> with MHP_MEMBLOCK_API because it represents __what__ is that flag about and its
> >>> function, e.g: create memory block devices.
> > 
> > Exactly
> 
> Fine with me for keeping what Oscar has.
> 
> > 
> >> This nicely aligns with the sub-section memory add support discussion.
> >>
> >> MHP_MEMBLOCK_API immediately implies that
> >>
> >> - memory is used as system ram. Memory can be onlined/offlined. Markers
> >>   at sections indicate if the section is online/offline.
> > 
> > No there is no implication like that. It means only that the onlined
> > memory has a sysfs interface. Nothing more, nothing less
> 
> As soon as there is a online/offline interface, you *can* (and user
> space usually *will*) online that memory. Onlining/offlining is only
> defined for memory to be added to the buddy - memory to be used as
> "system ram". Doing it for random device memory will not work / result
> in undefined behavior.

No, not really. We really do not care where the memory comes from. Is it
RAM, NVDIMM, $FOO_BAR_OF_A_FUTURE_BUZZ. We only do care that the memory
can be onlined - user triggered associated with a zone. The memory even
doesn't have to go to the page allocator.
David Hildenbrand April 5, 2019, 10:56 a.m. UTC | #8
On 05.04.19 12:30, Michal Hocko wrote:
> On Fri 05-04-19 10:05:09, David Hildenbrand wrote:
>> On 05.04.19 09:14, Michal Hocko wrote:
>>> On Thu 04-04-19 20:27:41, David Hildenbrand wrote:
>>>> On 04.04.19 20:01, Oscar Salvador wrote:
>>> [...]
>>>>> But I am not really convinced by MHP_SYSTEM_RAM name, and I think we should stick
>>>>> with MHP_MEMBLOCK_API because it represents __what__ is that flag about and its
>>>>> function, e.g: create memory block devices.
>>>
>>> Exactly
>>
>> Fine with me for keeping what Oscar has.
>>
>>>
>>>> This nicely aligns with the sub-section memory add support discussion.
>>>>
>>>> MHP_MEMBLOCK_API immediately implies that
>>>>
>>>> - memory is used as system ram. Memory can be onlined/offlined. Markers
>>>>   at sections indicate if the section is online/offline.
>>>
>>> No there is no implication like that. It means only that the onlined
>>> memory has a sysfs interface. Nothing more, nothing less
>>
>> As soon as there is a online/offline interface, you *can* (and user
>> space usually *will*) online that memory. Onlining/offlining is only
>> defined for memory to be added to the buddy - memory to be used as
>> "system ram". Doing it for random device memory will not work / result
>> in undefined behavior.
> 
> No, not really. We really do not care where the memory comes from. Is it
> RAM, NVDIMM, $FOO_BAR_OF_A_FUTURE_BUZZ. We only do care that the memory
> can be onlined - user triggered associated with a zone. The memory even
> doesn't have to go to the page allocator.
> 

Yes, true, I was rather meaning if the caller wants ZONE_DEVICE, memory
block devices are never an option. So "how the memory is used" not
"where memory comes from".

add_memory() vs. devm_memremap_pages().
diff mbox series

Patch

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index e6acfa7be4c7..db509550329d 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1046,8 +1046,8 @@  int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
-		    bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+			struct mhp_restrictions *restrictions)
 {
 	int flags = 0;
 
@@ -1058,6 +1058,6 @@  int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
 			     size, PAGE_KERNEL, pgd_pgtable_alloc, flags);
 
 	return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
-			   altmap, want_memblock);
+							restrictions);
 }
 #endif
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index e49200e31750..379eb1f9adc9 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -666,14 +666,14 @@  mem_init (void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
-		bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+			struct mhp_restrictions *restrictions)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int ret;
 
-	ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+	ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
 	if (ret)
 		printk("%s: Problem encountered in __add_pages() as ret=%d\n",
 		       __func__,  ret);
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 1aa27aac73c5..76deaa8525db 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -109,8 +109,8 @@  int __weak remove_section_mapping(unsigned long start, unsigned long end)
 	return -ENODEV;
 }
 
-int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
-		bool want_memblock)
+int __meminit arch_add_memory(int nid, u64 start, u64 size,
+			struct mhp_restrictions *restrictions)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -127,7 +127,7 @@  int __meminit arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *
 	}
 	flush_inval_dcache_range(start, start + size);
 
-	return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+	return __add_pages(nid, start_pfn, nr_pages, restrictions);
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 25e3113091ea..f5db961ad792 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -216,8 +216,8 @@  device_initcall(s390_cma_mem_init);
 
 #endif /* CONFIG_CMA */
 
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
-		bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+		struct mhp_restrictions *restrictions)
 {
 	unsigned long start_pfn = PFN_DOWN(start);
 	unsigned long size_pages = PFN_DOWN(size);
@@ -227,7 +227,7 @@  int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
 	if (rc)
 		return rc;
 
-	rc = __add_pages(nid, start_pfn, size_pages, altmap, want_memblock);
+	rc = __add_pages(nid, start_pfn, size_pages, restrictions);
 	if (rc)
 		vmem_remove_mapping(start, size);
 	return rc;
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 8e004b2f1a6a..168d3a6b9358 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -404,15 +404,15 @@  void __init mem_init(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
-		bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+			struct mhp_restrictions *restrictions)
 {
 	unsigned long start_pfn = PFN_DOWN(start);
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int ret;
 
 	/* We only have ZONE_NORMAL, so this is easy.. */
-	ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+	ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
 	if (unlikely(ret))
 		printk("%s: Failed, __add_pages() == %d\n", __func__, ret);
 
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 85c94f9a87f8..755dbed85531 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -850,13 +850,13 @@  void __init mem_init(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
-		bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+			struct mhp_restrictions *restrictions)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 
-	return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+	return __add_pages(nid, start_pfn, nr_pages, restrictions);
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index bccff68e3267..db42c11b48fb 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -777,11 +777,11 @@  static void update_end_of_memory_vars(u64 start, u64 size)
 }
 
 int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
-		struct vmem_altmap *altmap, bool want_memblock)
+				struct mhp_restrictions *restrictions)
 {
 	int ret;
 
-	ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+	ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
 	WARN_ON_ONCE(ret);
 
 	/* update max_pfn, max_low_pfn and high_memory */
@@ -791,15 +791,15 @@  int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
 	return ret;
 }
 
-int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap,
-		bool want_memblock)
+int arch_add_memory(int nid, u64 start, u64 size,
+			struct mhp_restrictions *restrictions)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 
 	init_memory_mapping(start, start + size);
 
-	return add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+	return add_pages(nid, start_pfn, nr_pages, restrictions);
 }
 
 #define PAGE_INUSE 0xFD
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 3c8cf347804c..5bd4b56f639c 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -118,20 +118,37 @@  extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
 	unsigned long nr_pages, struct vmem_altmap *altmap);
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
+/*
+ * Do we want sysfs memblock files created. This will allow userspace to online
+ * and offline memory explicitly. Lack of this bit means that the caller has to
+ * call move_pfn_range_to_zone to finish the initialization.
+ */
+
+#define MHP_MEMBLOCK_API               (1<<0)
+
+/*
+ * Restrictions for the memory hotplug:
+ * flags:  MHP_ flags
+ * altmap: alternative allocator for memmap array
+ */
+struct mhp_restrictions {
+	unsigned long flags;
+	struct vmem_altmap *altmap;
+};
+
 /* reasonably generic interface to expand the physical pages */
 extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
-		struct vmem_altmap *altmap, bool want_memblock);
+					struct mhp_restrictions *restrictions);
 
 #ifndef CONFIG_ARCH_HAS_ADD_PAGES
 static inline int add_pages(int nid, unsigned long start_pfn,
-		unsigned long nr_pages, struct vmem_altmap *altmap,
-		bool want_memblock)
+		unsigned long nr_pages, struct mhp_restrictions *restrictions)
 {
-	return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock);
+	return __add_pages(nid, start_pfn, nr_pages, restrictions);
 }
 #else /* ARCH_HAS_ADD_PAGES */
 int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages,
-		struct vmem_altmap *altmap, bool want_memblock);
+				struct mhp_restrictions *restrictions);
 #endif /* ARCH_HAS_ADD_PAGES */
 
 #ifdef CONFIG_NUMA
@@ -333,7 +350,7 @@  extern int __add_memory(int nid, u64 start, u64 size);
 extern int add_memory(int nid, u64 start, u64 size);
 extern int add_memory_resource(int nid, struct resource *resource);
 extern int arch_add_memory(int nid, u64 start, u64 size,
-		struct vmem_altmap *altmap, bool want_memblock);
+			struct mhp_restrictions *restrictions);
 extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
 		unsigned long nr_pages, struct vmem_altmap *altmap);
 extern bool is_memblock_offlined(struct memory_block *mem);
diff --git a/kernel/memremap.c b/kernel/memremap.c
index a856cb5ff192..cc5e3e34417d 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -149,6 +149,7 @@  void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 	struct resource *res = &pgmap->res;
 	struct dev_pagemap *conflict_pgmap;
 	pgprot_t pgprot = PAGE_KERNEL;
+	struct mhp_restrictions restrictions = {};
 	int error, nid, is_ram;
 
 	if (!pgmap->ref || !pgmap->kill)
@@ -199,6 +200,9 @@  void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 	if (error)
 		goto err_pfn_remap;
 
+	/* We do not want any optional features only our own memmap */
+	restrictions.altmap = altmap;
+
 	mem_hotplug_begin();
 
 	/*
@@ -214,7 +218,7 @@  void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 	 */
 	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
 		error = add_pages(nid, align_start >> PAGE_SHIFT,
-				align_size >> PAGE_SHIFT, NULL, false);
+				align_size >> PAGE_SHIFT, &restrictions);
 	} else {
 		error = kasan_add_zero_shadow(__va(align_start), align_size);
 		if (error) {
@@ -222,8 +226,8 @@  void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
 			goto err_kasan;
 		}
 
-		error = arch_add_memory(nid, align_start, align_size, altmap,
-				false);
+		error = arch_add_memory(nid, align_start, align_size,
+							&restrictions);
 	}
 
 	if (!error) {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index d8a3e9554aec..50f77e059457 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -274,12 +274,12 @@  static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
  * add the new pages.
  */
 int __ref __add_pages(int nid, unsigned long phys_start_pfn,
-		unsigned long nr_pages, struct vmem_altmap *altmap,
-		bool want_memblock)
+		unsigned long nr_pages, struct mhp_restrictions *restrictions)
 {
 	unsigned long i;
 	int err = 0;
 	int start_sec, end_sec;
+	struct vmem_altmap *altmap = restrictions->altmap;
 
 	/* during initialize mem_map, align hot-added range to section */
 	start_sec = pfn_to_section_nr(phys_start_pfn);
@@ -300,7 +300,7 @@  int __ref __add_pages(int nid, unsigned long phys_start_pfn,
 
 	for (i = start_sec; i <= end_sec; i++) {
 		err = __add_section(nid, section_nr_to_pfn(i), altmap,
-				want_memblock);
+				restrictions->flags & MHP_MEMBLOCK_API);
 
 		/*
 		 * EEXIST is finally dealt with by ioresource collision
@@ -1102,6 +1102,7 @@  int __ref add_memory_resource(int nid, struct resource *res)
 	u64 start, size;
 	bool new_node = false;
 	int ret;
+	struct mhp_restrictions restrictions = {};
 
 	start = res->start;
 	size = resource_size(res);
@@ -1126,7 +1127,8 @@  int __ref add_memory_resource(int nid, struct resource *res)
 	new_node = ret;
 
 	/* call arch's memory hotadd */
-	ret = arch_add_memory(nid, start, size, NULL, true);
+	restrictions.flags = MHP_MEMBLOCK_API;
+	ret = arch_add_memory(nid, start, size, &restrictions);
 	if (ret < 0)
 		goto error;