diff mbox series

[v2] resource: Merge resources on a node when hot-adding memory

Message ID 20180806065224.31383-1-rashmica.g@gmail.com (mailing list archive)
State New, archived
Headers show
Series [v2] resource: Merge resources on a node when hot-adding memory | expand

Commit Message

Rashmica Gupta Aug. 6, 2018, 6:52 a.m. UTC
When hot-removing memory release_mem_region_adjustable() splits
iomem resources if they are not the exact size of the memory being
hot-deleted. Adding this memory back to the kernel adds a new
resource.

Eg a node has memory 0x0 - 0xfffffffff. Offlining and hot-removing
1GB from 0xf40000000 results in the single resource 0x0-0xfffffffff being
split into two resources: 0x0-0xf3fffffff and 0xf80000000-0xfffffffff.

When we hot-add the memory back we now have three resources:
0x0-0xf3fffffff, 0xf40000000-0xf7fffffff, and 0xf80000000-0xfffffffff.

Now if we try to remove a section of memory that overlaps these resources,
like 2GB from 0xf40000000, release_mem_region_adjustable() fails as it
expects the chunk of memory to be within the boundaries of a single
resource.

This patch adds a function request_resource_and_merge(). This is called
instead of request_resource_conflict() when registering a resource in
add_memory(). It calls request_resource_conflict() and if hot-removing is
enabled (if it isn't we won't get resource fragmentation) we attempt to
merge contiguous resources on the node.

Signed-off-by: Rashmica Gupta <rashmica.g@gmail.com>
---
v1->v2: Only attempt to merge resources if hot-remove is enabled.

 include/linux/ioport.h         |   2 +
 include/linux/memory_hotplug.h |   2 +-
 kernel/resource.c              | 116 +++++++++++++++++++++++++++++++++++++++++
 mm/memory_hotplug.c            |  22 ++++----
 4 files changed, 130 insertions(+), 12 deletions(-)

Comments

Mike Rapoport Aug. 6, 2018, 2:14 p.m. UTC | #1
On Mon, Aug 06, 2018 at 04:52:24PM +1000, Rashmica Gupta wrote:
> When hot-removing memory release_mem_region_adjustable() splits
> iomem resources if they are not the exact size of the memory being
> hot-deleted. Adding this memory back to the kernel adds a new
> resource.
> 
> Eg a node has memory 0x0 - 0xfffffffff. Offlining and hot-removing
> 1GB from 0xf40000000 results in the single resource 0x0-0xfffffffff being
> split into two resources: 0x0-0xf3fffffff and 0xf80000000-0xfffffffff.
> 
> When we hot-add the memory back we now have three resources:
> 0x0-0xf3fffffff, 0xf40000000-0xf7fffffff, and 0xf80000000-0xfffffffff.
> 
> Now if we try to remove a section of memory that overlaps these resources,
> like 2GB from 0xf40000000, release_mem_region_adjustable() fails as it
> expects the chunk of memory to be within the boundaries of a single
> resource.
> 
> This patch adds a function request_resource_and_merge(). This is called
> instead of request_resource_conflict() when registering a resource in
> add_memory(). It calls request_resource_conflict() and if hot-removing is
> enabled (if it isn't we won't get resource fragmentation) we attempt to
> merge contiguous resources on the node.
> 
> Signed-off-by: Rashmica Gupta <rashmica.g@gmail.com>
> ---
> v1->v2: Only attempt to merge resources if hot-remove is enabled.
> 
>  include/linux/ioport.h         |   2 +
>  include/linux/memory_hotplug.h |   2 +-
>  kernel/resource.c              | 116 +++++++++++++++++++++++++++++++++++++++++
>  mm/memory_hotplug.c            |  22 ++++----
>  4 files changed, 130 insertions(+), 12 deletions(-)
> 
> diff --git a/include/linux/ioport.h b/include/linux/ioport.h
> index da0ebaec25f0..f5b93a711e86 100644
> --- a/include/linux/ioport.h
> +++ b/include/linux/ioport.h
> @@ -189,6 +189,8 @@ extern int allocate_resource(struct resource *root, struct resource *new,
>  						       resource_size_t,
>  						       resource_size_t),
>  			     void *alignf_data);
> +extern struct resource *request_resource_and_merge(struct resource *parent,
> +						   struct resource *new, int nid);
>  struct resource *lookup_resource(struct resource *root, resource_size_t start);
>  int adjust_resource(struct resource *res, resource_size_t start,
>  		    resource_size_t size);
> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> index 4e9828cda7a2..9c00f97c8cc6 100644
> --- a/include/linux/memory_hotplug.h
> +++ b/include/linux/memory_hotplug.h
> @@ -322,7 +322,7 @@ static inline void remove_memory(int nid, u64 start, u64 size) {}
>  extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
>  		void *arg, int (*func)(struct memory_block *, void *));
>  extern int add_memory(int nid, u64 start, u64 size);
> -extern int add_memory_resource(int nid, struct resource *resource, bool online);
> +extern int add_memory_resource(int nid, u64 start, u64 size, bool online);

The add_memory_resource() function is also used by Xen balloon. It should
be updated as well.

>  extern int arch_add_memory(int nid, u64 start, u64 size,
>  		struct vmem_altmap *altmap, bool want_memblock);
>  extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
> diff --git a/kernel/resource.c b/kernel/resource.c
> index 30e1bc68503b..5967061f9cea 100644
> --- a/kernel/resource.c
> +++ b/kernel/resource.c
> @@ -1621,3 +1621,119 @@ static int __init strict_iomem(char *str)
>  }
> 
>  __setup("iomem=", strict_iomem);
> +
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +#ifdef CONFIG_MEMORY_HOTREMOVE
> +/*
> + * Attempt to merge resource and it's sibling
> + */
> +static int merge_resources(struct resource *res)
> +{
> +	struct resource *next;
> +	struct resource *tmp;
> +	uint64_t size;
> +	int ret = -EINVAL;
> +
> +	next = res->sibling;
> +
> +	/*
> +	 * Not sure how to handle two different children. So only attempt
> +	 * to merge two resources if neither have children, only one has a
> +	 * child or if both have the same child.
> +	 */
> +	if ((res->child && next->child) && (res->child != next->child))
> +		return ret;
> +
> +	if (res->end + 1 != next->start)
> +		return ret;
> +
> +	if (res->flags != next->flags)
> +		return ret;
> +
> +	/* Update sibling and child of resource */
> +	res->sibling = next->sibling;
> +	tmp = res->child;
> +	if (!res->child)
> +		res->child = next->child;
> +
> +	size = next->end - res->start + 1;
> +	ret = __adjust_resource(res, res->start, size);
> +	if (ret) {
> +		/* Failed so restore resource to original state */
> +		res->sibling = next;
> +		res->child = tmp;
> +		return ret;
> +	}
> +
> +	free_resource(next);
> +
> +	return ret;
> +}
> +
> +/*
> + * Attempt to merge resources on the node
> + */
> +static void merge_node_resources(int nid, struct resource *parent)
> +{
> +	struct resource *res;
> +	uint64_t start_addr;
> +	uint64_t end_addr;
> +	int ret;
> +
> +	start_addr = node_start_pfn(nid) << PAGE_SHIFT;
> +	end_addr = node_end_pfn(nid) << PAGE_SHIFT;
> +
> +	write_lock(&resource_lock);
> +
> +	/* Get the first resource */
> +	res = parent->child;
> +
> +	while (res) {
> +		/* Check that the resource is within the node */
> +		if (res->start < start_addr) {
> +			res = res->sibling;
> +			continue;
> +		}
> +		/* Exit if resource is past end of node */
> +		if (res->sibling->end > end_addr)
> +			break;
> +
> +		ret = merge_resources(res);
> +		if (!ret)
> +			continue;
> +		res = res->sibling;
> +	}
> +	write_unlock(&resource_lock);
> +}
> +#endif /* CONFIG_MEMORY_HOTREMOVE */
> +
> +/**
> + * request_resource_and_merge() - request an I/O or memory resource for hot-add
> + * @parent: parent resource descriptor
> + * @new: resource descriptor desired by caller
> + * @nid: node id of the node we want the resource on
> + *
> + * Returns NULL for success and conflict resource on error.
> + * If no conflict resource then attempt to merge resources on the node.

Please put the return value description at the end and start it with
'Return:' rather than 'Returns'

> + *
> + * This is intended to cleanup the fragmentation of resources that occurs when
> + * hot-removing memory (see release_mem_region_adjustable). If hot-removing is
> + * not enabled then there is no point trying to merge resources.

I think it's worth noting that inability to merge the resources is not an
error.

> + */
> +struct resource *request_resource_and_merge(struct resource *parent,
> +					    struct resource *new, int nid)
> +{
> +	struct resource *conflict;
> +
> +	conflict = request_resource_conflict(parent, new);
> +
> +	if (conflict)
> +		return conflict;
> +
> +#ifdef CONFIG_MEMORY_HOTREMOVE
> +	merge_node_resources(nid, parent);
> +#endif /* CONFIG_MEMORY_HOTREMOVE */
> +
> +	return NULL;
> +}
> +#endif /* CONFIG_MEMORY_HOTPLUG */
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 7deb49f69e27..2e342f5ce322 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -97,7 +97,7 @@ void mem_hotplug_done(void)
>  }
> 
>  /* add this memory to iomem resource */
> -static struct resource *register_memory_resource(u64 start, u64 size)
> +static struct resource *register_memory_resource(int nid, u64 start, u64 size)
>  {
>  	struct resource *res, *conflict;
>  	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
> @@ -108,7 +108,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
>  	res->start = start;
>  	res->end = start + size - 1;
>  	res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
> -	conflict =  request_resource_conflict(&iomem_resource, res);
> +	conflict =  request_resource_and_merge(&iomem_resource, res, nid);
>  	if (conflict) {
>  		if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
>  			pr_debug("Device unaddressable memory block "
> @@ -122,11 +122,15 @@ static struct resource *register_memory_resource(u64 start, u64 size)
>  	return res;
>  }
> 
> -static void release_memory_resource(struct resource *res)
> +static void release_memory_resource(struct resource *res, u64 start, u64 size)
>  {
>  	if (!res)
>  		return;
> +#ifdef CONFIG_MEMORY_HOTREMOVE
> +	release_mem_region_adjustable(&iomem_resource, start, size);
> +#else
>  	release_resource(res);
> +#endif
>  	kfree(res);
>  	return;
>  }
> @@ -1096,17 +1100,13 @@ static int online_memory_block(struct memory_block *mem, void *arg)
>  }
> 
>  /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
> -int __ref add_memory_resource(int nid, struct resource *res, bool online)
> +int __ref add_memory_resource(int nid, u64 start, u64 size, bool online)
>  {
> -	u64 start, size;
>  	pg_data_t *pgdat = NULL;
>  	bool new_pgdat;
>  	bool new_node;
>  	int ret;
> 
> -	start = res->start;
> -	size = resource_size(res);
> -
>  	ret = check_hotplug_memory_range(start, size);
>  	if (ret)
>  		return ret;
> @@ -1195,13 +1195,13 @@ int __ref add_memory(int nid, u64 start, u64 size)
>  	struct resource *res;
>  	int ret;
> 
> -	res = register_memory_resource(start, size);
> +	res = register_memory_resource(nid, start, size);
>  	if (IS_ERR(res))
>  		return PTR_ERR(res);
> 
> -	ret = add_memory_resource(nid, res, memhp_auto_online);
> +	ret = add_memory_resource(nid, start, size, memhp_auto_online);
>  	if (ret < 0)
> -		release_memory_resource(res);
> +		release_memory_resource(res, start, size);
>  	return ret;
>  }
>  EXPORT_SYMBOL_GPL(add_memory);
> -- 
> 2.14.4
>
Rashmica Gupta Aug. 6, 2018, 11:59 p.m. UTC | #2
On 07/08/18 00:14, Mike Rapoport wrote:

> On Mon, Aug 06, 2018 at 04:52:24PM +1000, Rashmica Gupta wrote:
>> When hot-removing memory release_mem_region_adjustable() splits
>> iomem resources if they are not the exact size of the memory being
>> hot-deleted. Adding this memory back to the kernel adds a new
>> resource.
>>
>> Eg a node has memory 0x0 - 0xfffffffff. Offlining and hot-removing
>> 1GB from 0xf40000000 results in the single resource 0x0-0xfffffffff being
>> split into two resources: 0x0-0xf3fffffff and 0xf80000000-0xfffffffff.
>>
>> When we hot-add the memory back we now have three resources:
>> 0x0-0xf3fffffff, 0xf40000000-0xf7fffffff, and 0xf80000000-0xfffffffff.
>>
>> Now if we try to remove a section of memory that overlaps these resources,
>> like 2GB from 0xf40000000, release_mem_region_adjustable() fails as it
>> expects the chunk of memory to be within the boundaries of a single
>> resource.
>>
>> This patch adds a function request_resource_and_merge(). This is called
>> instead of request_resource_conflict() when registering a resource in
>> add_memory(). It calls request_resource_conflict() and if hot-removing is
>> enabled (if it isn't we won't get resource fragmentation) we attempt to
>> merge contiguous resources on the node.
>>
>> Signed-off-by: Rashmica Gupta <rashmica.g@gmail.com>
>> ---
>> v1->v2: Only attempt to merge resources if hot-remove is enabled.
>>
>>  include/linux/ioport.h         |   2 +
>>  include/linux/memory_hotplug.h |   2 +-
>>  kernel/resource.c              | 116 +++++++++++++++++++++++++++++++++++++++++
>>  mm/memory_hotplug.c            |  22 ++++----
>>  4 files changed, 130 insertions(+), 12 deletions(-)
>>
>> diff --git a/include/linux/ioport.h b/include/linux/ioport.h
>> index da0ebaec25f0..f5b93a711e86 100644
>> --- a/include/linux/ioport.h
>> +++ b/include/linux/ioport.h
>> @@ -189,6 +189,8 @@ extern int allocate_resource(struct resource *root, struct resource *new,
>>  						       resource_size_t,
>>  						       resource_size_t),
>>  			     void *alignf_data);
>> +extern struct resource *request_resource_and_merge(struct resource *parent,
>> +						   struct resource *new, int nid);
>>  struct resource *lookup_resource(struct resource *root, resource_size_t start);
>>  int adjust_resource(struct resource *res, resource_size_t start,
>>  		    resource_size_t size);
>> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
>> index 4e9828cda7a2..9c00f97c8cc6 100644
>> --- a/include/linux/memory_hotplug.h
>> +++ b/include/linux/memory_hotplug.h
>> @@ -322,7 +322,7 @@ static inline void remove_memory(int nid, u64 start, u64 size) {}
>>  extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
>>  		void *arg, int (*func)(struct memory_block *, void *));
>>  extern int add_memory(int nid, u64 start, u64 size);
>> -extern int add_memory_resource(int nid, struct resource *resource, bool online);
>> +extern int add_memory_resource(int nid, u64 start, u64 size, bool online);
> The add_memory_resource() function is also used by Xen balloon. It should
> be updated as well.

Thanks for spotting that.

>>  extern int arch_add_memory(int nid, u64 start, u64 size,
>>  		struct vmem_altmap *altmap, bool want_memblock);
>>  extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
>> diff --git a/kernel/resource.c b/kernel/resource.c
>> index 30e1bc68503b..5967061f9cea 100644
>> --- a/kernel/resource.c
>> +++ b/kernel/resource.c
>> @@ -1621,3 +1621,119 @@ static int __init strict_iomem(char *str)
>>  }
>>
>>  __setup("iomem=", strict_iomem);
>> +
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>> +/*
>> + * Attempt to merge resource and it's sibling
>> + */
>> +static int merge_resources(struct resource *res)
>> +{
>> +	struct resource *next;
>> +	struct resource *tmp;
>> +	uint64_t size;
>> +	int ret = -EINVAL;
>> +
>> +	next = res->sibling;
>> +
>> +	/*
>> +	 * Not sure how to handle two different children. So only attempt
>> +	 * to merge two resources if neither have children, only one has a
>> +	 * child or if both have the same child.
>> +	 */
>> +	if ((res->child && next->child) && (res->child != next->child))
>> +		return ret;
>> +
>> +	if (res->end + 1 != next->start)
>> +		return ret;
>> +
>> +	if (res->flags != next->flags)
>> +		return ret;
>> +
>> +	/* Update sibling and child of resource */
>> +	res->sibling = next->sibling;
>> +	tmp = res->child;
>> +	if (!res->child)
>> +		res->child = next->child;
>> +
>> +	size = next->end - res->start + 1;
>> +	ret = __adjust_resource(res, res->start, size);
>> +	if (ret) {
>> +		/* Failed so restore resource to original state */
>> +		res->sibling = next;
>> +		res->child = tmp;
>> +		return ret;
>> +	}
>> +
>> +	free_resource(next);
>> +
>> +	return ret;
>> +}
>> +
>> +/*
>> + * Attempt to merge resources on the node
>> + */
>> +static void merge_node_resources(int nid, struct resource *parent)
>> +{
>> +	struct resource *res;
>> +	uint64_t start_addr;
>> +	uint64_t end_addr;
>> +	int ret;
>> +
>> +	start_addr = node_start_pfn(nid) << PAGE_SHIFT;
>> +	end_addr = node_end_pfn(nid) << PAGE_SHIFT;
>> +
>> +	write_lock(&resource_lock);
>> +
>> +	/* Get the first resource */
>> +	res = parent->child;
>> +
>> +	while (res) {
>> +		/* Check that the resource is within the node */
>> +		if (res->start < start_addr) {
>> +			res = res->sibling;
>> +			continue;
>> +		}
>> +		/* Exit if resource is past end of node */
>> +		if (res->sibling->end > end_addr)
>> +			break;
>> +
>> +		ret = merge_resources(res);
>> +		if (!ret)
>> +			continue;
>> +		res = res->sibling;
>> +	}
>> +	write_unlock(&resource_lock);
>> +}
>> +#endif /* CONFIG_MEMORY_HOTREMOVE */
>> +
>> +/**
>> + * request_resource_and_merge() - request an I/O or memory resource for hot-add
>> + * @parent: parent resource descriptor
>> + * @new: resource descriptor desired by caller
>> + * @nid: node id of the node we want the resource on
>> + *
>> + * Returns NULL for success and conflict resource on error.
>> + * If no conflict resource then attempt to merge resources on the node.
> Please put the return value description at the end and start it with
> 'Return:' rather than 'Returns'

Will do.

>> + *
>> + * This is intended to cleanup the fragmentation of resources that occurs when
>> + * hot-removing memory (see release_mem_region_adjustable). If hot-removing is
>> + * not enabled then there is no point trying to merge resources.
> I think it's worth noting that inability to merge the resources is not an
> error.

Good point.

>> + */
>> +struct resource *request_resource_and_merge(struct resource *parent,
>> +					    struct resource *new, int nid)
>> +{
>> +	struct resource *conflict;
>> +
>> +	conflict = request_resource_conflict(parent, new);
>> +
>> +	if (conflict)
>> +		return conflict;
>> +
>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>> +	merge_node_resources(nid, parent);
>> +#endif /* CONFIG_MEMORY_HOTREMOVE */
>> +
>> +	return NULL;
>> +}
>> +#endif /* CONFIG_MEMORY_HOTPLUG */
>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>> index 7deb49f69e27..2e342f5ce322 100644
>> --- a/mm/memory_hotplug.c
>> +++ b/mm/memory_hotplug.c
>> @@ -97,7 +97,7 @@ void mem_hotplug_done(void)
>>  }
>>
>>  /* add this memory to iomem resource */
>> -static struct resource *register_memory_resource(u64 start, u64 size)
>> +static struct resource *register_memory_resource(int nid, u64 start, u64 size)
>>  {
>>  	struct resource *res, *conflict;
>>  	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
>> @@ -108,7 +108,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
>>  	res->start = start;
>>  	res->end = start + size - 1;
>>  	res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
>> -	conflict =  request_resource_conflict(&iomem_resource, res);
>> +	conflict =  request_resource_and_merge(&iomem_resource, res, nid);
>>  	if (conflict) {
>>  		if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
>>  			pr_debug("Device unaddressable memory block "
>> @@ -122,11 +122,15 @@ static struct resource *register_memory_resource(u64 start, u64 size)
>>  	return res;
>>  }
>>
>> -static void release_memory_resource(struct resource *res)
>> +static void release_memory_resource(struct resource *res, u64 start, u64 size)
>>  {
>>  	if (!res)
>>  		return;
>> +#ifdef CONFIG_MEMORY_HOTREMOVE
>> +	release_mem_region_adjustable(&iomem_resource, start, size);
>> +#else
>>  	release_resource(res);
>> +#endif
>>  	kfree(res);
>>  	return;
>>  }
>> @@ -1096,17 +1100,13 @@ static int online_memory_block(struct memory_block *mem, void *arg)
>>  }
>>
>>  /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
>> -int __ref add_memory_resource(int nid, struct resource *res, bool online)
>> +int __ref add_memory_resource(int nid, u64 start, u64 size, bool online)
>>  {
>> -	u64 start, size;
>>  	pg_data_t *pgdat = NULL;
>>  	bool new_pgdat;
>>  	bool new_node;
>>  	int ret;
>>
>> -	start = res->start;
>> -	size = resource_size(res);
>> -
>>  	ret = check_hotplug_memory_range(start, size);
>>  	if (ret)
>>  		return ret;
>> @@ -1195,13 +1195,13 @@ int __ref add_memory(int nid, u64 start, u64 size)
>>  	struct resource *res;
>>  	int ret;
>>
>> -	res = register_memory_resource(start, size);
>> +	res = register_memory_resource(nid, start, size);
>>  	if (IS_ERR(res))
>>  		return PTR_ERR(res);
>>
>> -	ret = add_memory_resource(nid, res, memhp_auto_online);
>> +	ret = add_memory_resource(nid, start, size, memhp_auto_online);
>>  	if (ret < 0)
>> -		release_memory_resource(res);
>> +		release_memory_resource(res, start, size);
>>  	return ret;
>>  }
>>  EXPORT_SYMBOL_GPL(add_memory);
>> -- 
>> 2.14.4
>>
Vlastimil Babka Aug. 7, 2018, 11:52 a.m. UTC | #3
On 08/06/2018 08:52 AM, Rashmica Gupta wrote:
> When hot-removing memory release_mem_region_adjustable() splits
> iomem resources if they are not the exact size of the memory being
> hot-deleted. Adding this memory back to the kernel adds a new
> resource.
> 
> Eg a node has memory 0x0 - 0xfffffffff. Offlining and hot-removing
> 1GB from 0xf40000000 results in the single resource 0x0-0xfffffffff being
> split into two resources: 0x0-0xf3fffffff and 0xf80000000-0xfffffffff.
> 
> When we hot-add the memory back we now have three resources:
> 0x0-0xf3fffffff, 0xf40000000-0xf7fffffff, and 0xf80000000-0xfffffffff.
> 
> Now if we try to remove a section of memory that overlaps these resources,
> like 2GB from 0xf40000000, release_mem_region_adjustable() fails as it
> expects the chunk of memory to be within the boundaries of a single
> resource.

Hi,

it's the first time I see the resource code, so I might be easily wrong.
How can it happen that the second remove is section aligned but the
first one not?

> This patch adds a function request_resource_and_merge(). This is called
> instead of request_resource_conflict() when registering a resource in
> add_memory(). It calls request_resource_conflict() and if hot-removing is
> enabled (if it isn't we won't get resource fragmentation) we attempt to
> merge contiguous resources on the node.
> 
> Signed-off-by: Rashmica Gupta <rashmica.g@gmail.com>
...
> --- a/kernel/resource.c
> +++ b/kernel/resource.c
...
> +/*
> + * Attempt to merge resources on the node
> + */
> +static void merge_node_resources(int nid, struct resource *parent)
> +{
> +	struct resource *res;
> +	uint64_t start_addr;
> +	uint64_t end_addr;
> +	int ret;
> +
> +	start_addr = node_start_pfn(nid) << PAGE_SHIFT;
> +	end_addr = node_end_pfn(nid) << PAGE_SHIFT;
> +
> +	write_lock(&resource_lock);
> +
> +	/* Get the first resource */
> +	res = parent->child;
> +
> +	while (res) {
> +		/* Check that the resource is within the node */
> +		if (res->start < start_addr) {
> +			res = res->sibling;
> +			continue;
> +		}
> +		/* Exit if resource is past end of node */
> +		if (res->sibling->end > end_addr)
> +			break;

IIUC, resource end is closed, so adjacent resources's start is end+1.
But node_end_pfn is open, so the comparison above should use '>='
instead of '>'?

> +
> +		ret = merge_resources(res);
> +		if (!ret)
> +			continue;
> +		res = res->sibling;

Should this rather use next_resource() to merge at all levels of the
hierarchy? Although memory seems to be flat under &iomem_resource so it
would be just future-proofing.

Thanks,
Vlastimil
Rashmica Gupta Aug. 8, 2018, 5:44 a.m. UTC | #4
On Tue, Aug 7, 2018 at 9:52 PM, Vlastimil Babka <vbabka@suse.cz> wrote:
> On 08/06/2018 08:52 AM, Rashmica Gupta wrote:
>> When hot-removing memory release_mem_region_adjustable() splits
>> iomem resources if they are not the exact size of the memory being
>> hot-deleted. Adding this memory back to the kernel adds a new
>> resource.
>>
>> Eg a node has memory 0x0 - 0xfffffffff. Offlining and hot-removing
>> 1GB from 0xf40000000 results in the single resource 0x0-0xfffffffff being
>> split into two resources: 0x0-0xf3fffffff and 0xf80000000-0xfffffffff.
>>
>> When we hot-add the memory back we now have three resources:
>> 0x0-0xf3fffffff, 0xf40000000-0xf7fffffff, and 0xf80000000-0xfffffffff.
>>
>> Now if we try to remove a section of memory that overlaps these resources,
>> like 2GB from 0xf40000000, release_mem_region_adjustable() fails as it
>> expects the chunk of memory to be within the boundaries of a single
>> resource.
>
> Hi,
>
> it's the first time I see the resource code, so I might be easily wrong.
> How can it happen that the second remove is section aligned but the
> first one not?
>

I probably shouldn't have used that word... When I said "a section of memory"
I really meant "a chunk of memory" or "some memory".


>> This patch adds a function request_resource_and_merge(). This is called
>> instead of request_resource_conflict() when registering a resource in
>> add_memory(). It calls request_resource_conflict() and if hot-removing is
>> enabled (if it isn't we won't get resource fragmentation) we attempt to
>> merge contiguous resources on the node.
>>
>> Signed-off-by: Rashmica Gupta <rashmica.g@gmail.com>
> ...
>> --- a/kernel/resource.c
>> +++ b/kernel/resource.c
> ...
>> +/*
>> + * Attempt to merge resources on the node
>> + */
>> +static void merge_node_resources(int nid, struct resource *parent)
>> +{
>> +     struct resource *res;
>> +     uint64_t start_addr;
>> +     uint64_t end_addr;
>> +     int ret;
>> +
>> +     start_addr = node_start_pfn(nid) << PAGE_SHIFT;
>> +     end_addr = node_end_pfn(nid) << PAGE_SHIFT;
>> +
>> +     write_lock(&resource_lock);
>> +
>> +     /* Get the first resource */
>> +     res = parent->child;
>> +
>> +     while (res) {
>> +             /* Check that the resource is within the node */
>> +             if (res->start < start_addr) {
>> +                     res = res->sibling;
>> +                     continue;
>> +             }
>> +             /* Exit if resource is past end of node */
>> +             if (res->sibling->end > end_addr)
>> +                     break;
>
> IIUC, resource end is closed, so adjacent resources's start is end+1.
> But node_end_pfn is open, so the comparison above should use '>='
> instead of '>'?

You are right. Thanks for spotting that.

>
>> +
>> +             ret = merge_resources(res);
>> +             if (!ret)
>> +                     continue;
>> +             res = res->sibling;
>
> Should this rather use next_resource() to merge at all levels of the
> hierarchy? Although memory seems to be flat under &iomem_resource so it
> would be just future-proofing.

I don't know enough about the hierarchy and layout of resources to comment on
this.

>
> Thanks,
> Vlastimil
diff mbox series

Patch

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index da0ebaec25f0..f5b93a711e86 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -189,6 +189,8 @@  extern int allocate_resource(struct resource *root, struct resource *new,
 						       resource_size_t,
 						       resource_size_t),
 			     void *alignf_data);
+extern struct resource *request_resource_and_merge(struct resource *parent,
+						   struct resource *new, int nid);
 struct resource *lookup_resource(struct resource *root, resource_size_t start);
 int adjust_resource(struct resource *res, resource_size_t start,
 		    resource_size_t size);
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 4e9828cda7a2..9c00f97c8cc6 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -322,7 +322,7 @@  static inline void remove_memory(int nid, u64 start, u64 size) {}
 extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
 		void *arg, int (*func)(struct memory_block *, void *));
 extern int add_memory(int nid, u64 start, u64 size);
-extern int add_memory_resource(int nid, struct resource *resource, bool online);
+extern int add_memory_resource(int nid, u64 start, u64 size, bool online);
 extern int arch_add_memory(int nid, u64 start, u64 size,
 		struct vmem_altmap *altmap, bool want_memblock);
 extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
diff --git a/kernel/resource.c b/kernel/resource.c
index 30e1bc68503b..5967061f9cea 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1621,3 +1621,119 @@  static int __init strict_iomem(char *str)
 }
 
 __setup("iomem=", strict_iomem);
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/*
+ * Attempt to merge resource and it's sibling
+ */
+static int merge_resources(struct resource *res)
+{
+	struct resource *next;
+	struct resource *tmp;
+	uint64_t size;
+	int ret = -EINVAL;
+
+	next = res->sibling;
+
+	/*
+	 * Not sure how to handle two different children. So only attempt
+	 * to merge two resources if neither have children, only one has a
+	 * child or if both have the same child.
+	 */
+	if ((res->child && next->child) && (res->child != next->child))
+		return ret;
+
+	if (res->end + 1 != next->start)
+		return ret;
+
+	if (res->flags != next->flags)
+		return ret;
+
+	/* Update sibling and child of resource */
+	res->sibling = next->sibling;
+	tmp = res->child;
+	if (!res->child)
+		res->child = next->child;
+
+	size = next->end - res->start + 1;
+	ret = __adjust_resource(res, res->start, size);
+	if (ret) {
+		/* Failed so restore resource to original state */
+		res->sibling = next;
+		res->child = tmp;
+		return ret;
+	}
+
+	free_resource(next);
+
+	return ret;
+}
+
+/*
+ * Attempt to merge resources on the node
+ */
+static void merge_node_resources(int nid, struct resource *parent)
+{
+	struct resource *res;
+	uint64_t start_addr;
+	uint64_t end_addr;
+	int ret;
+
+	start_addr = node_start_pfn(nid) << PAGE_SHIFT;
+	end_addr = node_end_pfn(nid) << PAGE_SHIFT;
+
+	write_lock(&resource_lock);
+
+	/* Get the first resource */
+	res = parent->child;
+
+	while (res) {
+		/* Check that the resource is within the node */
+		if (res->start < start_addr) {
+			res = res->sibling;
+			continue;
+		}
+		/* Exit if resource is past end of node */
+		if (res->sibling->end > end_addr)
+			break;
+
+		ret = merge_resources(res);
+		if (!ret)
+			continue;
+		res = res->sibling;
+	}
+	write_unlock(&resource_lock);
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+
+/**
+ * request_resource_and_merge() - request an I/O or memory resource for hot-add
+ * @parent: parent resource descriptor
+ * @new: resource descriptor desired by caller
+ * @nid: node id of the node we want the resource on
+ *
+ * Returns NULL for success and conflict resource on error.
+ * If no conflict resource then attempt to merge resources on the node.
+ *
+ * This is intended to cleanup the fragmentation of resources that occurs when
+ * hot-removing memory (see release_mem_region_adjustable). If hot-removing is
+ * not enabled then there is no point trying to merge resources.
+ */
+struct resource *request_resource_and_merge(struct resource *parent,
+					    struct resource *new, int nid)
+{
+	struct resource *conflict;
+
+	conflict = request_resource_conflict(parent, new);
+
+	if (conflict)
+		return conflict;
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+	merge_node_resources(nid, parent);
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+
+	return NULL;
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 7deb49f69e27..2e342f5ce322 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -97,7 +97,7 @@  void mem_hotplug_done(void)
 }
 
 /* add this memory to iomem resource */
-static struct resource *register_memory_resource(u64 start, u64 size)
+static struct resource *register_memory_resource(int nid, u64 start, u64 size)
 {
 	struct resource *res, *conflict;
 	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
@@ -108,7 +108,7 @@  static struct resource *register_memory_resource(u64 start, u64 size)
 	res->start = start;
 	res->end = start + size - 1;
 	res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-	conflict =  request_resource_conflict(&iomem_resource, res);
+	conflict =  request_resource_and_merge(&iomem_resource, res, nid);
 	if (conflict) {
 		if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
 			pr_debug("Device unaddressable memory block "
@@ -122,11 +122,15 @@  static struct resource *register_memory_resource(u64 start, u64 size)
 	return res;
 }
 
-static void release_memory_resource(struct resource *res)
+static void release_memory_resource(struct resource *res, u64 start, u64 size)
 {
 	if (!res)
 		return;
+#ifdef CONFIG_MEMORY_HOTREMOVE
+	release_mem_region_adjustable(&iomem_resource, start, size);
+#else
 	release_resource(res);
+#endif
 	kfree(res);
 	return;
 }
@@ -1096,17 +1100,13 @@  static int online_memory_block(struct memory_block *mem, void *arg)
 }
 
 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-int __ref add_memory_resource(int nid, struct resource *res, bool online)
+int __ref add_memory_resource(int nid, u64 start, u64 size, bool online)
 {
-	u64 start, size;
 	pg_data_t *pgdat = NULL;
 	bool new_pgdat;
 	bool new_node;
 	int ret;
 
-	start = res->start;
-	size = resource_size(res);
-
 	ret = check_hotplug_memory_range(start, size);
 	if (ret)
 		return ret;
@@ -1195,13 +1195,13 @@  int __ref add_memory(int nid, u64 start, u64 size)
 	struct resource *res;
 	int ret;
 
-	res = register_memory_resource(start, size);
+	res = register_memory_resource(nid, start, size);
 	if (IS_ERR(res))
 		return PTR_ERR(res);
 
-	ret = add_memory_resource(nid, res, memhp_auto_online);
+	ret = add_memory_resource(nid, start, size, memhp_auto_online);
 	if (ret < 0)
-		release_memory_resource(res);
+		release_memory_resource(res, start, size);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(add_memory);