diff mbox series

[v4,11/23] device-dax: Kill dax_kmem_res

Message ID 159643100485.4062302.976628339798536960.stgit@dwillia2-desk3.amr.corp.intel.com
State New, archived
Headers show
Series device-dax: Support sub-dividing soft-reserved ranges | expand

Commit Message

Dan Williams Aug. 3, 2020, 5:03 a.m. UTC
Several related issues around this unneeded attribute:

- The dax_kmem_res property allows the kmem driver to stash the adjusted
  resource range that was used for the hotplug operation, but that can be
  recalculated from the original base range.

- kmem is using an open coded release_resource() + kfree() when an
  idiomatic release_mem_region() is sufficient.

- The driver managed resource need only manage the busy flag. Other flags
  are of no concern to the kmem driver. In fact if kmem inherits some
  memory range that add_memory_driver_managed() rejects that is a
  memory-hotplug-core policy that the driver is in no position to
  override.

- The implementation trusts that failed remove_memory() results in the
  entire resource range remaining pinned busy. The driver need not make
  that layering violation assumption and just maintain the busy state in
  its local resource.

- The "Hot-remove not yet implemented." comment is stale since hotremove
  support is now included.

Cc: David Hildenbrand <david@redhat.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/dax-private.h |    3 -
 drivers/dax/kmem.c        |  123 +++++++++++++++++++++------------------------
 2 files changed, 58 insertions(+), 68 deletions(-)

Comments

David Hildenbrand Aug. 21, 2020, 10:06 a.m. UTC | #1
On 03.08.20 07:03, Dan Williams wrote:
> Several related issues around this unneeded attribute:
> 
> - The dax_kmem_res property allows the kmem driver to stash the adjusted
>   resource range that was used for the hotplug operation, but that can be
>   recalculated from the original base range.
> 
> - kmem is using an open coded release_resource() + kfree() when an
>   idiomatic release_mem_region() is sufficient.
> 
> - The driver managed resource need only manage the busy flag. Other flags
>   are of no concern to the kmem driver. In fact if kmem inherits some
>   memory range that add_memory_driver_managed() rejects that is a
>   memory-hotplug-core policy that the driver is in no position to
>   override.
> 
> - The implementation trusts that failed remove_memory() results in the
>   entire resource range remaining pinned busy. The driver need not make
>   that layering violation assumption and just maintain the busy state in
>   its local resource.
> 
> - The "Hot-remove not yet implemented." comment is stale since hotremove
>   support is now included.

I think some of these changes could have been nicely split out to
simplify reviewing. E.g., comment update, release_mem_region(),  &=
~IORESOURCE_BUSY ...

[...]

> +
>  int dev_dax_kmem_probe(struct device *dev)
>  {
>  	struct dev_dax *dev_dax = to_dev_dax(dev);
> -	struct range *range = &dev_dax->range;
> -	resource_size_t kmem_start;
> -	resource_size_t kmem_size;
> -	resource_size_t kmem_end;
> -	struct resource *new_res;
> -	const char *new_res_name;
> -	int numa_node;
> +	struct range range = dax_kmem_range(dev_dax);
> +	int numa_node = dev_dax->target_node;
> +	struct resource *res;
> +	char *res_name;
>  	int rc;
>  
>  	/*
> @@ -37,109 +45,94 @@ int dev_dax_kmem_probe(struct device *dev)
>  	 * could be mixed in a node with faster memory, causing
>  	 * unavoidable performance issues.
>  	 */
> -	numa_node = dev_dax->target_node;
>  	if (numa_node < 0) {
>  		dev_warn(dev, "rejecting DAX region with invalid node: %d\n",
>  				numa_node);
>  		return -EINVAL;
>  	}
>  
> -	/* Hotplug starting at the beginning of the next block: */
> -	kmem_start = ALIGN(range->start, memory_block_size_bytes());
> -
> -	kmem_size = range_len(range);
> -	/* Adjust the size down to compensate for moving up kmem_start: */
> -	kmem_size -= kmem_start - range->start;
> -	/* Align the size down to cover only complete blocks: */
> -	kmem_size &= ~(memory_block_size_bytes() - 1);
> -	kmem_end = kmem_start + kmem_size;
> -
> -	new_res_name = kstrdup(dev_name(dev), GFP_KERNEL);
> -	if (!new_res_name)
> +	res_name = kstrdup(dev_name(dev), GFP_KERNEL);
> +	if (!res_name)
>  		return -ENOMEM;
>  
> -	/* Region is permanently reserved if hotremove fails. */
> -	new_res = request_mem_region(kmem_start, kmem_size, new_res_name);
> -	if (!new_res) {
> -		dev_warn(dev, "could not reserve region [%pa-%pa]\n",
> -			 &kmem_start, &kmem_end);
> -		kfree(new_res_name);
> +	res = request_mem_region(range.start, range_len(&range), res_name);

I think our range could be empty after aligning. I assume
request_mem_region() would check that, but maybe we could report a
better error/warning in that case.

> +	if (!res) {
> +		dev_warn(dev, "could not reserve region [%#llx-%#llx]\n",
> +				range.start, range.end);
> +		kfree(res_name);
>  		return -EBUSY;
>  	}
>  
>  	/*
> -	 * Set flags appropriate for System RAM.  Leave ..._BUSY clear
> -	 * so that add_memory() can add a child resource.  Do not
> -	 * inherit flags from the parent since it may set new flags
> -	 * unknown to us that will break add_memory() below.
> +	 * Temporarily clear busy to allow add_memory_driver_managed()
> +	 * to claim it.
>  	 */
> -	new_res->flags = IORESOURCE_SYSTEM_RAM;
> +	res->flags &= ~IORESOURCE_BUSY;

Right, same approach is taken by virtio-mem.

>  
>  	/*
>  	 * Ensure that future kexec'd kernels will not treat this as RAM
>  	 * automatically.
>  	 */
> -	rc = add_memory_driver_managed(numa_node, new_res->start,
> -				       resource_size(new_res), kmem_name);
> +	rc = add_memory_driver_managed(numa_node, res->start,
> +				       resource_size(res), kmem_name);
> +
> +	res->flags |= IORESOURCE_BUSY;

Hm, I don't think that's correct. Any specific reason why to mark the
not-added, unaligned parts BUSY? E.g., walk_system_ram_range() could
suddenly stumble over it - and e.g., similarly kexec code when trying to
find memory for placing kexec images. I think we should leave this
!BUSY, just as it is right now.

>  	if (rc) {
> -		release_resource(new_res);
> -		kfree(new_res);
> -		kfree(new_res_name);
> +		release_mem_region(range.start, range_len(&range));
> +		kfree(res_name);
>  		return rc;
>  	}
> -	dev_dax->dax_kmem_res = new_res;
> +
> +	dev_set_drvdata(dev, res_name);
>  
>  	return 0;
>  }
>  
>  #ifdef CONFIG_MEMORY_HOTREMOVE
> -static int dev_dax_kmem_remove(struct device *dev)
> +static void dax_kmem_release(struct dev_dax *dev_dax)
>  {
> -	struct dev_dax *dev_dax = to_dev_dax(dev);
> -	struct resource *res = dev_dax->dax_kmem_res;
> -	resource_size_t kmem_start = res->start;
> -	resource_size_t kmem_size = resource_size(res);
> -	const char *res_name = res->name;
>  	int rc;
> +	struct device *dev = &dev_dax->dev;
> +	const char *res_name = dev_get_drvdata(dev);
> +	struct range range = dax_kmem_range(dev_dax);
>  
>  	/*
>  	 * We have one shot for removing memory, if some memory blocks were not
>  	 * offline prior to calling this function remove_memory() will fail, and
>  	 * there is no way to hotremove this memory until reboot because device
> -	 * unbind will succeed even if we return failure.
> +	 * unbind will proceed regardless of the remove_memory result.
>  	 */
> -	rc = remove_memory(dev_dax->target_node, kmem_start, kmem_size);
> -	if (rc) {
> -		any_hotremove_failed = true;
> -		dev_err(dev,
> -			"DAX region %pR cannot be hotremoved until the next reboot\n",
> -			res);
> -		return rc;
> +	rc = remove_memory(dev_dax->target_node, range.start, range_len(&range));
> +	if (rc == 0) {

if (!rc) ?

> +		release_mem_region(range.start, range_len(&range));

remove_memory() does a release_mem_region_adjustable(). Don't you
actually want to release the *unaligned* region you requested?

> +		dev_set_drvdata(dev, NULL);
> +		kfree(res_name);
> +		return;
>  	}

Not sure if inverting the error handling improved the code / review here.
>  
> -	/* Release and free dax resources */
> -	release_resource(res);
> -	kfree(res);
> -	kfree(res_name);
> -	dev_dax->dax_kmem_res = NULL;
> -
> -	return 0;
> +	any_hotremove_failed = true;
> +	dev_err(dev, "%#llx-%#llx cannot be hotremoved until the next reboot\n",
> +			range.start, range.end);
>  }
>  #else
> -static int dev_dax_kmem_remove(struct device *dev)
> +static void dax_kmem_release(struct dev_dax *dev_dax)
>  {
>  	/*
> -	 * Without hotremove purposely leak the request_mem_region() for the
> -	 * device-dax range and return '0' to ->remove() attempts. The removal
> -	 * of the device from the driver always succeeds, but the region is
> -	 * permanently pinned as reserved by the unreleased
> -	 * request_mem_region().
> +	 * Without hotremove purposely leak the request_mem_region() for
> +	 * the device-dax range attempts. The removal of the device from
> +	 * the driver always succeeds, but the region is permanently
> +	 * pinned as reserved by the unreleased request_mem_region().
>  	 */
>  	any_hotremove_failed = true;
> -	return 0;
>  }
>  #endif /* CONFIG_MEMORY_HOTREMOVE */
>  
> +static int dev_dax_kmem_remove(struct device *dev)
> +{
> +	dax_kmem_release(to_dev_dax(dev));
> +	return 0;
> +}
> +
>  static struct dax_device_driver device_dax_kmem_driver = {
>  	.drv = {
>  		.probe = dev_dax_kmem_probe,
> 

Maybe split some of these changes out. Would at least help me to review ;)
Joao Martins Sept. 8, 2020, 3:33 p.m. UTC | #2
[Sorry for the late response]

On 8/21/20 11:06 AM, David Hildenbrand wrote:
> On 03.08.20 07:03, Dan Williams wrote:
>> @@ -37,109 +45,94 @@ int dev_dax_kmem_probe(struct device *dev)
>>  	 * could be mixed in a node with faster memory, causing
>>  	 * unavoidable performance issues.
>>  	 */
>> -	numa_node = dev_dax->target_node;
>>  	if (numa_node < 0) {
>>  		dev_warn(dev, "rejecting DAX region with invalid node: %d\n",
>>  				numa_node);
>>  		return -EINVAL;
>>  	}
>>  
>> -	/* Hotplug starting at the beginning of the next block: */
>> -	kmem_start = ALIGN(range->start, memory_block_size_bytes());
>> -
>> -	kmem_size = range_len(range);
>> -	/* Adjust the size down to compensate for moving up kmem_start: */
>> -	kmem_size -= kmem_start - range->start;
>> -	/* Align the size down to cover only complete blocks: */
>> -	kmem_size &= ~(memory_block_size_bytes() - 1);
>> -	kmem_end = kmem_start + kmem_size;
>> -
>> -	new_res_name = kstrdup(dev_name(dev), GFP_KERNEL);
>> -	if (!new_res_name)
>> +	res_name = kstrdup(dev_name(dev), GFP_KERNEL);
>> +	if (!res_name)
>>  		return -ENOMEM;
>>  
>> -	/* Region is permanently reserved if hotremove fails. */
>> -	new_res = request_mem_region(kmem_start, kmem_size, new_res_name);
>> -	if (!new_res) {
>> -		dev_warn(dev, "could not reserve region [%pa-%pa]\n",
>> -			 &kmem_start, &kmem_end);
>> -		kfree(new_res_name);
>> +	res = request_mem_region(range.start, range_len(&range), res_name);
> 
> I think our range could be empty after aligning. I assume
> request_mem_region() would check that, but maybe we could report a
> better error/warning in that case.
> 
dax_kmem_range() already returns a memory-block-aligned @range but
IIUC request_mem_region() isn't checking for that. Having said that
the returned @res wouldn't be different from the passed range.start.

>>  	/*
>>  	 * Ensure that future kexec'd kernels will not treat this as RAM
>>  	 * automatically.
>>  	 */
>> -	rc = add_memory_driver_managed(numa_node, new_res->start,
>> -				       resource_size(new_res), kmem_name);
>> +	rc = add_memory_driver_managed(numa_node, res->start,
>> +				       resource_size(res), kmem_name);
>> +
>> +	res->flags |= IORESOURCE_BUSY;
> 
> Hm, I don't think that's correct. Any specific reason why to mark the
> not-added, unaligned parts BUSY? E.g., walk_system_ram_range() could
> suddenly stumble over it - and e.g., similarly kexec code when trying to
> find memory for placing kexec images. I think we should leave this
> !BUSY, just as it is right now.
> 
Agreed.

>>  	if (rc) {
>> -		release_resource(new_res);
>> -		kfree(new_res);
>> -		kfree(new_res_name);
>> +		release_mem_region(range.start, range_len(&range));
>> +		kfree(res_name);
>>  		return rc;
>>  	}
>> -	dev_dax->dax_kmem_res = new_res;
>> +
>> +	dev_set_drvdata(dev, res_name);
>>  
>>  	return 0;
>>  }
>>  
>>  #ifdef CONFIG_MEMORY_HOTREMOVE
>> -static int dev_dax_kmem_remove(struct device *dev)
>> +static void dax_kmem_release(struct dev_dax *dev_dax)
>>  {
>> -	struct dev_dax *dev_dax = to_dev_dax(dev);
>> -	struct resource *res = dev_dax->dax_kmem_res;
>> -	resource_size_t kmem_start = res->start;
>> -	resource_size_t kmem_size = resource_size(res);
>> -	const char *res_name = res->name;
>>  	int rc;
>> +	struct device *dev = &dev_dax->dev;
>> +	const char *res_name = dev_get_drvdata(dev);
>> +	struct range range = dax_kmem_range(dev_dax);
>>  
>>  	/*
>>  	 * We have one shot for removing memory, if some memory blocks were not
>>  	 * offline prior to calling this function remove_memory() will fail, and
>>  	 * there is no way to hotremove this memory until reboot because device
>> -	 * unbind will succeed even if we return failure.
>> +	 * unbind will proceed regardless of the remove_memory result.
>>  	 */
>> -	rc = remove_memory(dev_dax->target_node, kmem_start, kmem_size);
>> -	if (rc) {
>> -		any_hotremove_failed = true;
>> -		dev_err(dev,
>> -			"DAX region %pR cannot be hotremoved until the next reboot\n",
>> -			res);
>> -		return rc;
>> +	rc = remove_memory(dev_dax->target_node, range.start, range_len(&range));
>> +	if (rc == 0) {
> 
> if (!rc) ?
> 
Better off would be to keep the old order:

	if (rc) {
		any_hotremove_failed = true;
		dev_err(dev, "%#llx-%#llx cannot be hotremoved until the next reboot\n",
				range.start, range.end);
	        return;
	}

	release_mem_region(range.start, range_len(&range));
	dev_set_drvdata(dev, NULL);
	kfree(res_name);
	return;


>> +		release_mem_region(range.start, range_len(&range));
> 
> remove_memory() does a release_mem_region_adjustable(). Don't you
> actually want to release the *unaligned* region you requested?
> 
Isn't it what we're doing here?
(The release_mem_region_adjustable() is using the same
dax_kmem-aligned range and there's no split/adjust)

Meaning right now (+ parent marked as !BUSY), and if I am understanding
this correctly:

request_mem_region(range.start, range_len)
   __request_region(iomem_res, range.start, range_len) -> alloc @parent
add_memory_driver_managed(parent.start, resource_size(parent))
   __request_region(parent.start, resource_size(parent)) -> alloc @child

[...]

remove_memory(range.start, range_len)
 release_mem_region_adjustable(range.start, range_len)
  __release_region(range.start, range_len) -> remove @child

release_mem_region(range.start, range_len)
  __release_region(range.start, range_len) -> doesn't remove @parent because !BUSY?

The add/removal of this relies on !BUSY. But now I am wondering if the parent remaining
unreleased is deliberate even on CONFIG_MEMORY_HOTREMOVE=y.

	Joao
David Hildenbrand Sept. 8, 2020, 6:03 p.m. UTC | #3
>>> +		release_mem_region(range.start, range_len(&range));
>>
>> remove_memory() does a release_mem_region_adjustable(). Don't you
>> actually want to release the *unaligned* region you requested?
>>
> Isn't it what we're doing here?
> (The release_mem_region_adjustable() is using the same
> dax_kmem-aligned range and there's no split/adjust)

Oh, I think I was messing up things (there is just too much going on in
this patch).

Right, request_mem_region() and add_memory_driver_managed() are - and
were - called with the exact same range. That would have been clearer if
the patch would simply use range.start and range_len(&range) for both
calls (similar in the original code).

So, also the release calls have to use the same range. Agreed.

> 
> Meaning right now (+ parent marked as !BUSY), and if I am understanding
> this correctly:
> 
> request_mem_region(range.start, range_len)
>    __request_region(iomem_res, range.start, range_len) -> alloc @parent
> add_memory_driver_managed(parent.start, resource_size(parent))
>    __request_region(parent.start, resource_size(parent)) -> alloc @child
> 
> [...]
> 
> remove_memory(range.start, range_len)
> >  release_mem_region_adjustable(range.start, range_len)
>   __release_region(range.start, range_len) -> remove @child
> 
> release_mem_region(range.start, range_len)
>   __release_region(range.start, range_len) -> doesn't remove @parent because !BUSY?
> 
> The add/removal of this relies on !BUSY. But now I am wondering if the parent remaining
> unreleased is deliberate even on CONFIG_MEMORY_HOTREMOVE=y.

Interesting, I can only tell that virtio-mem expects that
remove_memory() won't remove the parent resource (which is !BUSY). So it
relies on the existing functionality.

I do wonder how walk_system_ram_range() behaves if both the parent and
the child are BUSY. Looking at it, I think it will detect the parent and
skip to the next range (without visiting the child) - which is not what
we want.

We could set the parent to BUSY just before doing the
release_mem_region() call, but that feels like a hack.

Maybe it's just easier to keep dax_kmem_res around ...
David Hildenbrand Sept. 23, 2020, 8:04 a.m. UTC | #4
On 08.09.20 17:33, Joao Martins wrote:
> [Sorry for the late response]
> 
> On 8/21/20 11:06 AM, David Hildenbrand wrote:
>> On 03.08.20 07:03, Dan Williams wrote:
>>> @@ -37,109 +45,94 @@ int dev_dax_kmem_probe(struct device *dev)
>>>  	 * could be mixed in a node with faster memory, causing
>>>  	 * unavoidable performance issues.
>>>  	 */
>>> -	numa_node = dev_dax->target_node;
>>>  	if (numa_node < 0) {
>>>  		dev_warn(dev, "rejecting DAX region with invalid node: %d\n",
>>>  				numa_node);
>>>  		return -EINVAL;
>>>  	}
>>>  
>>> -	/* Hotplug starting at the beginning of the next block: */
>>> -	kmem_start = ALIGN(range->start, memory_block_size_bytes());
>>> -
>>> -	kmem_size = range_len(range);
>>> -	/* Adjust the size down to compensate for moving up kmem_start: */
>>> -	kmem_size -= kmem_start - range->start;
>>> -	/* Align the size down to cover only complete blocks: */
>>> -	kmem_size &= ~(memory_block_size_bytes() - 1);
>>> -	kmem_end = kmem_start + kmem_size;
>>> -
>>> -	new_res_name = kstrdup(dev_name(dev), GFP_KERNEL);
>>> -	if (!new_res_name)
>>> +	res_name = kstrdup(dev_name(dev), GFP_KERNEL);
>>> +	if (!res_name)
>>>  		return -ENOMEM;
>>>  
>>> -	/* Region is permanently reserved if hotremove fails. */
>>> -	new_res = request_mem_region(kmem_start, kmem_size, new_res_name);
>>> -	if (!new_res) {
>>> -		dev_warn(dev, "could not reserve region [%pa-%pa]\n",
>>> -			 &kmem_start, &kmem_end);
>>> -		kfree(new_res_name);
>>> +	res = request_mem_region(range.start, range_len(&range), res_name);
>>
>> I think our range could be empty after aligning. I assume
>> request_mem_region() would check that, but maybe we could report a
>> better error/warning in that case.
>>
> dax_kmem_range() already returns a memory-block-aligned @range but
> IIUC request_mem_region() isn't checking for that. Having said that
> the returned @res wouldn't be different from the passed range.start.
> 
>>>  	/*
>>>  	 * Ensure that future kexec'd kernels will not treat this as RAM
>>>  	 * automatically.
>>>  	 */
>>> -	rc = add_memory_driver_managed(numa_node, new_res->start,
>>> -				       resource_size(new_res), kmem_name);
>>> +	rc = add_memory_driver_managed(numa_node, res->start,
>>> +				       resource_size(res), kmem_name);
>>> +
>>> +	res->flags |= IORESOURCE_BUSY;
>>
>> Hm, I don't think that's correct. Any specific reason why to mark the
>> not-added, unaligned parts BUSY? E.g., walk_system_ram_range() could
>> suddenly stumble over it - and e.g., similarly kexec code when trying to
>> find memory for placing kexec images. I think we should leave this
>> !BUSY, just as it is right now.
>>
> Agreed.
> 
>>>  	if (rc) {
>>> -		release_resource(new_res);
>>> -		kfree(new_res);
>>> -		kfree(new_res_name);
>>> +		release_mem_region(range.start, range_len(&range));
>>> +		kfree(res_name);
>>>  		return rc;
>>>  	}
>>> -	dev_dax->dax_kmem_res = new_res;
>>> +
>>> +	dev_set_drvdata(dev, res_name);
>>>  
>>>  	return 0;
>>>  }
>>>  
>>>  #ifdef CONFIG_MEMORY_HOTREMOVE
>>> -static int dev_dax_kmem_remove(struct device *dev)
>>> +static void dax_kmem_release(struct dev_dax *dev_dax)
>>>  {
>>> -	struct dev_dax *dev_dax = to_dev_dax(dev);
>>> -	struct resource *res = dev_dax->dax_kmem_res;
>>> -	resource_size_t kmem_start = res->start;
>>> -	resource_size_t kmem_size = resource_size(res);
>>> -	const char *res_name = res->name;
>>>  	int rc;
>>> +	struct device *dev = &dev_dax->dev;
>>> +	const char *res_name = dev_get_drvdata(dev);
>>> +	struct range range = dax_kmem_range(dev_dax);
>>>  
>>>  	/*
>>>  	 * We have one shot for removing memory, if some memory blocks were not
>>>  	 * offline prior to calling this function remove_memory() will fail, and
>>>  	 * there is no way to hotremove this memory until reboot because device
>>> -	 * unbind will succeed even if we return failure.
>>> +	 * unbind will proceed regardless of the remove_memory result.
>>>  	 */
>>> -	rc = remove_memory(dev_dax->target_node, kmem_start, kmem_size);
>>> -	if (rc) {
>>> -		any_hotremove_failed = true;
>>> -		dev_err(dev,
>>> -			"DAX region %pR cannot be hotremoved until the next reboot\n",
>>> -			res);
>>> -		return rc;
>>> +	rc = remove_memory(dev_dax->target_node, range.start, range_len(&range));
>>> +	if (rc == 0) {
>>
>> if (!rc) ?
>>
> Better off would be to keep the old order:
> 
> 	if (rc) {
> 		any_hotremove_failed = true;
> 		dev_err(dev, "%#llx-%#llx cannot be hotremoved until the next reboot\n",
> 				range.start, range.end);
> 	        return;
> 	}
> 
> 	release_mem_region(range.start, range_len(&range));
> 	dev_set_drvdata(dev, NULL);
> 	kfree(res_name);
> 	return;
> 
> 
>>> +		release_mem_region(range.start, range_len(&range));
>>
>> remove_memory() does a release_mem_region_adjustable(). Don't you
>> actually want to release the *unaligned* region you requested?
>>
> Isn't it what we're doing here?
> (The release_mem_region_adjustable() is using the same
> dax_kmem-aligned range and there's no split/adjust)
> 
> Meaning right now (+ parent marked as !BUSY), and if I am understanding
> this correctly:
> 
> request_mem_region(range.start, range_len)
>    __request_region(iomem_res, range.start, range_len) -> alloc @parent
> add_memory_driver_managed(parent.start, resource_size(parent))
>    __request_region(parent.start, resource_size(parent)) -> alloc @child
> 
> [...]
> 
> remove_memory(range.start, range_len)
> >>  release_mem_region_adjustable(range.start, range_len)
>   __release_region(range.start, range_len) -> remove @child
> 
> release_mem_region(range.start, range_len)
>   __release_region(range.start, range_len) -> doesn't remove @parent because !BUSY?
> 
> The add/removal of this relies on !BUSY. But now I am wondering if the parent remaining
> unreleased is deliberate even on CONFIG_MEMORY_HOTREMOVE=y.
> 
> 	Joao
> 

Thinking about it, if we don't set the parent resource BUSY (which is
what I think is the right way of doing things), and don't want to store
the parent resource pointer, we could add something like
lookup_resource() - e.g., lookup_mem_resource() - , however, searching
properly in the whole hierarchy (instead of only the first level), and
traversing down to the last hierarchy. Then it would be as simple as

remove_memory(range.start, range_len)
res = lookup_mem_resource(range.start);
release_resource(res);
Dan Williams Sept. 23, 2020, 9:41 p.m. UTC | #5
On Wed, Sep 23, 2020 at 1:04 AM David Hildenbrand <david@redhat.com> wrote:
>
> On 08.09.20 17:33, Joao Martins wrote:
> > [Sorry for the late response]
> >
> > On 8/21/20 11:06 AM, David Hildenbrand wrote:
> >> On 03.08.20 07:03, Dan Williams wrote:
> >>> @@ -37,109 +45,94 @@ int dev_dax_kmem_probe(struct device *dev)
> >>>      * could be mixed in a node with faster memory, causing
> >>>      * unavoidable performance issues.
> >>>      */
> >>> -   numa_node = dev_dax->target_node;
> >>>     if (numa_node < 0) {
> >>>             dev_warn(dev, "rejecting DAX region with invalid node: %d\n",
> >>>                             numa_node);
> >>>             return -EINVAL;
> >>>     }
> >>>
> >>> -   /* Hotplug starting at the beginning of the next block: */
> >>> -   kmem_start = ALIGN(range->start, memory_block_size_bytes());
> >>> -
> >>> -   kmem_size = range_len(range);
> >>> -   /* Adjust the size down to compensate for moving up kmem_start: */
> >>> -   kmem_size -= kmem_start - range->start;
> >>> -   /* Align the size down to cover only complete blocks: */
> >>> -   kmem_size &= ~(memory_block_size_bytes() - 1);
> >>> -   kmem_end = kmem_start + kmem_size;
> >>> -
> >>> -   new_res_name = kstrdup(dev_name(dev), GFP_KERNEL);
> >>> -   if (!new_res_name)
> >>> +   res_name = kstrdup(dev_name(dev), GFP_KERNEL);
> >>> +   if (!res_name)
> >>>             return -ENOMEM;
> >>>
> >>> -   /* Region is permanently reserved if hotremove fails. */
> >>> -   new_res = request_mem_region(kmem_start, kmem_size, new_res_name);
> >>> -   if (!new_res) {
> >>> -           dev_warn(dev, "could not reserve region [%pa-%pa]\n",
> >>> -                    &kmem_start, &kmem_end);
> >>> -           kfree(new_res_name);
> >>> +   res = request_mem_region(range.start, range_len(&range), res_name);
> >>
> >> I think our range could be empty after aligning. I assume
> >> request_mem_region() would check that, but maybe we could report a
> >> better error/warning in that case.
> >>
> > dax_kmem_range() already returns a memory-block-aligned @range but
> > IIUC request_mem_region() isn't checking for that. Having said that
> > the returned @res wouldn't be different from the passed range.start.
> >
> >>>     /*
> >>>      * Ensure that future kexec'd kernels will not treat this as RAM
> >>>      * automatically.
> >>>      */
> >>> -   rc = add_memory_driver_managed(numa_node, new_res->start,
> >>> -                                  resource_size(new_res), kmem_name);
> >>> +   rc = add_memory_driver_managed(numa_node, res->start,
> >>> +                                  resource_size(res), kmem_name);
> >>> +
> >>> +   res->flags |= IORESOURCE_BUSY;
> >>
> >> Hm, I don't think that's correct. Any specific reason why to mark the
> >> not-added, unaligned parts BUSY? E.g., walk_system_ram_range() could
> >> suddenly stumble over it - and e.g., similarly kexec code when trying to
> >> find memory for placing kexec images. I think we should leave this
> >> !BUSY, just as it is right now.
> >>
> > Agreed.
> >
> >>>     if (rc) {
> >>> -           release_resource(new_res);
> >>> -           kfree(new_res);
> >>> -           kfree(new_res_name);
> >>> +           release_mem_region(range.start, range_len(&range));
> >>> +           kfree(res_name);
> >>>             return rc;
> >>>     }
> >>> -   dev_dax->dax_kmem_res = new_res;
> >>> +
> >>> +   dev_set_drvdata(dev, res_name);
> >>>
> >>>     return 0;
> >>>  }
> >>>
> >>>  #ifdef CONFIG_MEMORY_HOTREMOVE
> >>> -static int dev_dax_kmem_remove(struct device *dev)
> >>> +static void dax_kmem_release(struct dev_dax *dev_dax)
> >>>  {
> >>> -   struct dev_dax *dev_dax = to_dev_dax(dev);
> >>> -   struct resource *res = dev_dax->dax_kmem_res;
> >>> -   resource_size_t kmem_start = res->start;
> >>> -   resource_size_t kmem_size = resource_size(res);
> >>> -   const char *res_name = res->name;
> >>>     int rc;
> >>> +   struct device *dev = &dev_dax->dev;
> >>> +   const char *res_name = dev_get_drvdata(dev);
> >>> +   struct range range = dax_kmem_range(dev_dax);
> >>>
> >>>     /*
> >>>      * We have one shot for removing memory, if some memory blocks were not
> >>>      * offline prior to calling this function remove_memory() will fail, and
> >>>      * there is no way to hotremove this memory until reboot because device
> >>> -    * unbind will succeed even if we return failure.
> >>> +    * unbind will proceed regardless of the remove_memory result.
> >>>      */
> >>> -   rc = remove_memory(dev_dax->target_node, kmem_start, kmem_size);
> >>> -   if (rc) {
> >>> -           any_hotremove_failed = true;
> >>> -           dev_err(dev,
> >>> -                   "DAX region %pR cannot be hotremoved until the next reboot\n",
> >>> -                   res);
> >>> -           return rc;
> >>> +   rc = remove_memory(dev_dax->target_node, range.start, range_len(&range));
> >>> +   if (rc == 0) {
> >>
> >> if (!rc) ?
> >>
> > Better off would be to keep the old order:
> >
> >       if (rc) {
> >               any_hotremove_failed = true;
> >               dev_err(dev, "%#llx-%#llx cannot be hotremoved until the next reboot\n",
> >                               range.start, range.end);
> >               return;
> >       }
> >
> >       release_mem_region(range.start, range_len(&range));
> >       dev_set_drvdata(dev, NULL);
> >       kfree(res_name);
> >       return;
> >
> >
> >>> +           release_mem_region(range.start, range_len(&range));
> >>
> >> remove_memory() does a release_mem_region_adjustable(). Don't you
> >> actually want to release the *unaligned* region you requested?
> >>
> > Isn't it what we're doing here?
> > (The release_mem_region_adjustable() is using the same
> > dax_kmem-aligned range and there's no split/adjust)
> >
> > Meaning right now (+ parent marked as !BUSY), and if I am understanding
> > this correctly:
> >
> > request_mem_region(range.start, range_len)
> >    __request_region(iomem_res, range.start, range_len) -> alloc @parent
> > add_memory_driver_managed(parent.start, resource_size(parent))
> >    __request_region(parent.start, resource_size(parent)) -> alloc @child
> >
> > [...]
> >
> > remove_memory(range.start, range_len)
> > >  release_mem_region_adjustable(range.start, range_len)
> >   __release_region(range.start, range_len) -> remove @child
> >
> > release_mem_region(range.start, range_len)
> >   __release_region(range.start, range_len) -> doesn't remove @parent because !BUSY?
> >
> > The add/removal of this relies on !BUSY. But now I am wondering if the parent remaining
> > unreleased is deliberate even on CONFIG_MEMORY_HOTREMOVE=y.
> >
> >       Joao
> >
>
> Thinking about it, if we don't set the parent resource BUSY (which is
> what I think is the right way of doing things), and don't want to store
> the parent resource pointer, we could add something like
> lookup_resource() - e.g., lookup_mem_resource() - , however, searching
> properly in the whole hierarchy (instead of only the first level), and
> traversing down to the last hierarchy. Then it would be as simple as
>
> remove_memory(range.start, range_len)
> res = lookup_mem_resource(range.start);
> release_resource(res);

Another thought... I notice that you've taught
register_memory_resource() a IORESOURCE_MEM_DRIVER_MANAGED special
case. Lets just make the assumption of add_memory_driver_managed()
that it is the driver's responsibility to mark the range busy before
calling, and the driver's responsibility to release the region. I.e.
validate (rather than request) that the range is busy in
register_memory_resource(), and teach release_memory_resource() to
skip releasing the region when the memory is marked driver managed.
That would let dax_kmem drop its manipulation of the 'busy' flag which
is a layering violation no matter how many comments we put around it.
David Hildenbrand Sept. 24, 2020, 7:25 a.m. UTC | #6
On 23.09.20 23:41, Dan Williams wrote:
> On Wed, Sep 23, 2020 at 1:04 AM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 08.09.20 17:33, Joao Martins wrote:
>>> [Sorry for the late response]
>>>
>>> On 8/21/20 11:06 AM, David Hildenbrand wrote:
>>>> On 03.08.20 07:03, Dan Williams wrote:
>>>>> @@ -37,109 +45,94 @@ int dev_dax_kmem_probe(struct device *dev)
>>>>>      * could be mixed in a node with faster memory, causing
>>>>>      * unavoidable performance issues.
>>>>>      */
>>>>> -   numa_node = dev_dax->target_node;
>>>>>     if (numa_node < 0) {
>>>>>             dev_warn(dev, "rejecting DAX region with invalid node: %d\n",
>>>>>                             numa_node);
>>>>>             return -EINVAL;
>>>>>     }
>>>>>
>>>>> -   /* Hotplug starting at the beginning of the next block: */
>>>>> -   kmem_start = ALIGN(range->start, memory_block_size_bytes());
>>>>> -
>>>>> -   kmem_size = range_len(range);
>>>>> -   /* Adjust the size down to compensate for moving up kmem_start: */
>>>>> -   kmem_size -= kmem_start - range->start;
>>>>> -   /* Align the size down to cover only complete blocks: */
>>>>> -   kmem_size &= ~(memory_block_size_bytes() - 1);
>>>>> -   kmem_end = kmem_start + kmem_size;
>>>>> -
>>>>> -   new_res_name = kstrdup(dev_name(dev), GFP_KERNEL);
>>>>> -   if (!new_res_name)
>>>>> +   res_name = kstrdup(dev_name(dev), GFP_KERNEL);
>>>>> +   if (!res_name)
>>>>>             return -ENOMEM;
>>>>>
>>>>> -   /* Region is permanently reserved if hotremove fails. */
>>>>> -   new_res = request_mem_region(kmem_start, kmem_size, new_res_name);
>>>>> -   if (!new_res) {
>>>>> -           dev_warn(dev, "could not reserve region [%pa-%pa]\n",
>>>>> -                    &kmem_start, &kmem_end);
>>>>> -           kfree(new_res_name);
>>>>> +   res = request_mem_region(range.start, range_len(&range), res_name);
>>>>
>>>> I think our range could be empty after aligning. I assume
>>>> request_mem_region() would check that, but maybe we could report a
>>>> better error/warning in that case.
>>>>
>>> dax_kmem_range() already returns a memory-block-aligned @range but
>>> IIUC request_mem_region() isn't checking for that. Having said that
>>> the returned @res wouldn't be different from the passed range.start.
>>>
>>>>>     /*
>>>>>      * Ensure that future kexec'd kernels will not treat this as RAM
>>>>>      * automatically.
>>>>>      */
>>>>> -   rc = add_memory_driver_managed(numa_node, new_res->start,
>>>>> -                                  resource_size(new_res), kmem_name);
>>>>> +   rc = add_memory_driver_managed(numa_node, res->start,
>>>>> +                                  resource_size(res), kmem_name);
>>>>> +
>>>>> +   res->flags |= IORESOURCE_BUSY;
>>>>
>>>> Hm, I don't think that's correct. Any specific reason why to mark the
>>>> not-added, unaligned parts BUSY? E.g., walk_system_ram_range() could
>>>> suddenly stumble over it - and e.g., similarly kexec code when trying to
>>>> find memory for placing kexec images. I think we should leave this
>>>> !BUSY, just as it is right now.
>>>>
>>> Agreed.
>>>
>>>>>     if (rc) {
>>>>> -           release_resource(new_res);
>>>>> -           kfree(new_res);
>>>>> -           kfree(new_res_name);
>>>>> +           release_mem_region(range.start, range_len(&range));
>>>>> +           kfree(res_name);
>>>>>             return rc;
>>>>>     }
>>>>> -   dev_dax->dax_kmem_res = new_res;
>>>>> +
>>>>> +   dev_set_drvdata(dev, res_name);
>>>>>
>>>>>     return 0;
>>>>>  }
>>>>>
>>>>>  #ifdef CONFIG_MEMORY_HOTREMOVE
>>>>> -static int dev_dax_kmem_remove(struct device *dev)
>>>>> +static void dax_kmem_release(struct dev_dax *dev_dax)
>>>>>  {
>>>>> -   struct dev_dax *dev_dax = to_dev_dax(dev);
>>>>> -   struct resource *res = dev_dax->dax_kmem_res;
>>>>> -   resource_size_t kmem_start = res->start;
>>>>> -   resource_size_t kmem_size = resource_size(res);
>>>>> -   const char *res_name = res->name;
>>>>>     int rc;
>>>>> +   struct device *dev = &dev_dax->dev;
>>>>> +   const char *res_name = dev_get_drvdata(dev);
>>>>> +   struct range range = dax_kmem_range(dev_dax);
>>>>>
>>>>>     /*
>>>>>      * We have one shot for removing memory, if some memory blocks were not
>>>>>      * offline prior to calling this function remove_memory() will fail, and
>>>>>      * there is no way to hotremove this memory until reboot because device
>>>>> -    * unbind will succeed even if we return failure.
>>>>> +    * unbind will proceed regardless of the remove_memory result.
>>>>>      */
>>>>> -   rc = remove_memory(dev_dax->target_node, kmem_start, kmem_size);
>>>>> -   if (rc) {
>>>>> -           any_hotremove_failed = true;
>>>>> -           dev_err(dev,
>>>>> -                   "DAX region %pR cannot be hotremoved until the next reboot\n",
>>>>> -                   res);
>>>>> -           return rc;
>>>>> +   rc = remove_memory(dev_dax->target_node, range.start, range_len(&range));
>>>>> +   if (rc == 0) {
>>>>
>>>> if (!rc) ?
>>>>
>>> Better off would be to keep the old order:
>>>
>>>       if (rc) {
>>>               any_hotremove_failed = true;
>>>               dev_err(dev, "%#llx-%#llx cannot be hotremoved until the next reboot\n",
>>>                               range.start, range.end);
>>>               return;
>>>       }
>>>
>>>       release_mem_region(range.start, range_len(&range));
>>>       dev_set_drvdata(dev, NULL);
>>>       kfree(res_name);
>>>       return;
>>>
>>>
>>>>> +           release_mem_region(range.start, range_len(&range));
>>>>
>>>> remove_memory() does a release_mem_region_adjustable(). Don't you
>>>> actually want to release the *unaligned* region you requested?
>>>>
>>> Isn't it what we're doing here?
>>> (The release_mem_region_adjustable() is using the same
>>> dax_kmem-aligned range and there's no split/adjust)
>>>
>>> Meaning right now (+ parent marked as !BUSY), and if I am understanding
>>> this correctly:
>>>
>>> request_mem_region(range.start, range_len)
>>>    __request_region(iomem_res, range.start, range_len) -> alloc @parent
>>> add_memory_driver_managed(parent.start, resource_size(parent))
>>>    __request_region(parent.start, resource_size(parent)) -> alloc @child
>>>
>>> [...]
>>>
>>> remove_memory(range.start, range_len)
>>>  request_mem_region_adjustable(range.start, range_len)
>>>   __release_region(range.start, range_len) -> remove @child
>>>
>>> release_mem_region(range.start, range_len)
>>>   __release_region(range.start, range_len) -> doesn't remove @parent because !BUSY?
>>>
>>> The add/removal of this relies on !BUSY. But now I am wondering if the parent remaining
>>> unreleased is deliberate even on CONFIG_MEMORY_HOTREMOVE=y.
>>>
>>>       Joao
>>>
>>
>> Thinking about it, if we don't set the parent resource BUSY (which is
>> what I think is the right way of doing things), and don't want to store
>> the parent resource pointer, we could add something like
>> lookup_resource() - e.g., lookup_mem_resource() - , however, searching
>> properly in the whole hierarchy (instead of only the first level), and
>> traversing down to the last hierarchy. Then it would be as simple as
>>
>> remove_memory(range.start, range_len)
>> res = lookup_mem_resource(range.start);
>> release_resource(res);
> 
> Another thought... I notice that you've taught
> register_memory_resource() a IORESOURCE_MEM_DRIVER_MANAGED special
> case. Lets just make the assumption of add_memory_driver_managed()
> that it is the driver's responsibility to mark the range busy before
> calling, and the driver's responsibility to release the region. I.e.
> validate (rather than request) that the range is busy in
> register_memory_resource(), and teach release_memory_resource() to
> skip releasing the region when the memory is marked driver managed.
> That would let dax_kmem drop its manipulation of the 'busy' flag which
> is a layering violation no matter how many comments we put around it.

IIUC, that won't work for virtio-mem, whereby the parent resource spans
multiple possible (future) add_memory_driver_managed() calls and is
(just like for kmem) a pure indication to which device memory ranges belong.

For example, when exposing 2GB via a virtio-mem device with max 4GB:

(/proc/iomem)
240000000-33fffffff : virtio0
  240000000-2bfffffff : System RAM (virtio_mem)

And after hotplugging additional 2GB:

240000000-33fffffff : virtio0
  240000000-33fffffff : System RAM (virtio_mem)

So marking "virtio0" always BUSY (especially right from the start) would
be wrong. The assumption is that anything that's IORESOURCE_SYSTEM_RAM
and IORESOUCE_BUSY is currently added to the system as system RAM (e.g.,
after add_memory() and friends, or during boot).

I do agree that manually clearing the busy flag is ugly. What we most
probably want is request_mem_region() that performs similar checks (no
overlaps with existing BUSY resources), but doesn't set the region busy.
Dan Williams Sept. 24, 2020, 1:54 p.m. UTC | #7
On Thu, Sep 24, 2020 at 12:26 AM David Hildenbrand <david@redhat.com> wrote:
>
> On 23.09.20 23:41, Dan Williams wrote:
> > On Wed, Sep 23, 2020 at 1:04 AM David Hildenbrand <david@redhat.com> wrote:
> >>
> >> On 08.09.20 17:33, Joao Martins wrote:
> >>> [Sorry for the late response]
> >>>
> >>> On 8/21/20 11:06 AM, David Hildenbrand wrote:
> >>>> On 03.08.20 07:03, Dan Williams wrote:
> >>>>> @@ -37,109 +45,94 @@ int dev_dax_kmem_probe(struct device *dev)
> >>>>>      * could be mixed in a node with faster memory, causing
> >>>>>      * unavoidable performance issues.
> >>>>>      */
> >>>>> -   numa_node = dev_dax->target_node;
> >>>>>     if (numa_node < 0) {
> >>>>>             dev_warn(dev, "rejecting DAX region with invalid node: %d\n",
> >>>>>                             numa_node);
> >>>>>             return -EINVAL;
> >>>>>     }
> >>>>>
> >>>>> -   /* Hotplug starting at the beginning of the next block: */
> >>>>> -   kmem_start = ALIGN(range->start, memory_block_size_bytes());
> >>>>> -
> >>>>> -   kmem_size = range_len(range);
> >>>>> -   /* Adjust the size down to compensate for moving up kmem_start: */
> >>>>> -   kmem_size -= kmem_start - range->start;
> >>>>> -   /* Align the size down to cover only complete blocks: */
> >>>>> -   kmem_size &= ~(memory_block_size_bytes() - 1);
> >>>>> -   kmem_end = kmem_start + kmem_size;
> >>>>> -
> >>>>> -   new_res_name = kstrdup(dev_name(dev), GFP_KERNEL);
> >>>>> -   if (!new_res_name)
> >>>>> +   res_name = kstrdup(dev_name(dev), GFP_KERNEL);
> >>>>> +   if (!res_name)
> >>>>>             return -ENOMEM;
> >>>>>
> >>>>> -   /* Region is permanently reserved if hotremove fails. */
> >>>>> -   new_res = request_mem_region(kmem_start, kmem_size, new_res_name);
> >>>>> -   if (!new_res) {
> >>>>> -           dev_warn(dev, "could not reserve region [%pa-%pa]\n",
> >>>>> -                    &kmem_start, &kmem_end);
> >>>>> -           kfree(new_res_name);
> >>>>> +   res = request_mem_region(range.start, range_len(&range), res_name);
> >>>>
> >>>> I think our range could be empty after aligning. I assume
> >>>> request_mem_region() would check that, but maybe we could report a
> >>>> better error/warning in that case.
> >>>>
> >>> dax_kmem_range() already returns a memory-block-aligned @range but
> >>> IIUC request_mem_region() isn't checking for that. Having said that
> >>> the returned @res wouldn't be different from the passed range.start.
> >>>
> >>>>>     /*
> >>>>>      * Ensure that future kexec'd kernels will not treat this as RAM
> >>>>>      * automatically.
> >>>>>      */
> >>>>> -   rc = add_memory_driver_managed(numa_node, new_res->start,
> >>>>> -                                  resource_size(new_res), kmem_name);
> >>>>> +   rc = add_memory_driver_managed(numa_node, res->start,
> >>>>> +                                  resource_size(res), kmem_name);
> >>>>> +
> >>>>> +   res->flags |= IORESOURCE_BUSY;
> >>>>
> >>>> Hm, I don't think that's correct. Any specific reason why to mark the
> >>>> not-added, unaligned parts BUSY? E.g., walk_system_ram_range() could
> >>>> suddenly stumble over it - and e.g., similarly kexec code when trying to
> >>>> find memory for placing kexec images. I think we should leave this
> >>>> !BUSY, just as it is right now.
> >>>>
> >>> Agreed.
> >>>
> >>>>>     if (rc) {
> >>>>> -           release_resource(new_res);
> >>>>> -           kfree(new_res);
> >>>>> -           kfree(new_res_name);
> >>>>> +           release_mem_region(range.start, range_len(&range));
> >>>>> +           kfree(res_name);
> >>>>>             return rc;
> >>>>>     }
> >>>>> -   dev_dax->dax_kmem_res = new_res;
> >>>>> +
> >>>>> +   dev_set_drvdata(dev, res_name);
> >>>>>
> >>>>>     return 0;
> >>>>>  }
> >>>>>
> >>>>>  #ifdef CONFIG_MEMORY_HOTREMOVE
> >>>>> -static int dev_dax_kmem_remove(struct device *dev)
> >>>>> +static void dax_kmem_release(struct dev_dax *dev_dax)
> >>>>>  {
> >>>>> -   struct dev_dax *dev_dax = to_dev_dax(dev);
> >>>>> -   struct resource *res = dev_dax->dax_kmem_res;
> >>>>> -   resource_size_t kmem_start = res->start;
> >>>>> -   resource_size_t kmem_size = resource_size(res);
> >>>>> -   const char *res_name = res->name;
> >>>>>     int rc;
> >>>>> +   struct device *dev = &dev_dax->dev;
> >>>>> +   const char *res_name = dev_get_drvdata(dev);
> >>>>> +   struct range range = dax_kmem_range(dev_dax);
> >>>>>
> >>>>>     /*
> >>>>>      * We have one shot for removing memory, if some memory blocks were not
> >>>>>      * offline prior to calling this function remove_memory() will fail, and
> >>>>>      * there is no way to hotremove this memory until reboot because device
> >>>>> -    * unbind will succeed even if we return failure.
> >>>>> +    * unbind will proceed regardless of the remove_memory result.
> >>>>>      */
> >>>>> -   rc = remove_memory(dev_dax->target_node, kmem_start, kmem_size);
> >>>>> -   if (rc) {
> >>>>> -           any_hotremove_failed = true;
> >>>>> -           dev_err(dev,
> >>>>> -                   "DAX region %pR cannot be hotremoved until the next reboot\n",
> >>>>> -                   res);
> >>>>> -           return rc;
> >>>>> +   rc = remove_memory(dev_dax->target_node, range.start, range_len(&range));
> >>>>> +   if (rc == 0) {
> >>>>
> >>>> if (!rc) ?
> >>>>
> >>> Better off would be to keep the old order:
> >>>
> >>>       if (rc) {
> >>>               any_hotremove_failed = true;
> >>>               dev_err(dev, "%#llx-%#llx cannot be hotremoved until the next reboot\n",
> >>>                               range.start, range.end);
> >>>               return;
> >>>       }
> >>>
> >>>       release_mem_region(range.start, range_len(&range));
> >>>       dev_set_drvdata(dev, NULL);
> >>>       kfree(res_name);
> >>>       return;
> >>>
> >>>
> >>>>> +           release_mem_region(range.start, range_len(&range));
> >>>>
> >>>> remove_memory() does a release_mem_region_adjustable(). Don't you
> >>>> actually want to release the *unaligned* region you requested?
> >>>>
> >>> Isn't it what we're doing here?
> >>> (The release_mem_region_adjustable() is using the same
> >>> dax_kmem-aligned range and there's no split/adjust)
> >>>
> >>> Meaning right now (+ parent marked as !BUSY), and if I am understanding
> >>> this correctly:
> >>>
> >>> request_mem_region(range.start, range_len)
> >>>    __request_region(iomem_res, range.start, range_len) -> alloc @parent
> >>> add_memory_driver_managed(parent.start, resource_size(parent))
> >>>    __request_region(parent.start, resource_size(parent)) -> alloc @child
> >>>
> >>> [...]
> >>>
> >>> remove_memory(range.start, range_len)
> >>>  request_mem_region_adjustable(range.start, range_len)
> >>>   __release_region(range.start, range_len) -> remove @child
> >>>
> >>> release_mem_region(range.start, range_len)
> >>>   __release_region(range.start, range_len) -> doesn't remove @parent because !BUSY?
> >>>
> >>> The add/removal of this relies on !BUSY. But now I am wondering if the parent remaining
> >>> unreleased is deliberate even on CONFIG_MEMORY_HOTREMOVE=y.
> >>>
> >>>       Joao
> >>>
> >>
> >> Thinking about it, if we don't set the parent resource BUSY (which is
> >> what I think is the right way of doing things), and don't want to store
> >> the parent resource pointer, we could add something like
> >> lookup_resource() - e.g., lookup_mem_resource() - , however, searching
> >> properly in the whole hierarchy (instead of only the first level), and
> >> traversing down to the last hierarchy. Then it would be as simple as
> >>
> >> remove_memory(range.start, range_len)
> >> res = lookup_mem_resource(range.start);
> >> release_resource(res);
> >
> > Another thought... I notice that you've taught
> > register_memory_resource() a IORESOURCE_MEM_DRIVER_MANAGED special
> > case. Lets just make the assumption of add_memory_driver_managed()
> > that it is the driver's responsibility to mark the range busy before
> > calling, and the driver's responsibility to release the region. I.e.
> > validate (rather than request) that the range is busy in
> > register_memory_resource(), and teach release_memory_resource() to
> > skip releasing the region when the memory is marked driver managed.
> > That would let dax_kmem drop its manipulation of the 'busy' flag which
> > is a layering violation no matter how many comments we put around it.
>
> IIUC, that won't work for virtio-mem, whereby the parent resource spans
> multiple possible (future) add_memory_driver_managed() calls and is
> (just like for kmem) a pure indication to which device memory ranges belong.
>
> For example, when exposing 2GB via a virtio-mem device with max 4GB:
>
> (/proc/iomem)
> 240000000-33fffffff : virtio0
>   240000000-2bfffffff : System RAM (virtio_mem)
>
> And after hotplugging additional 2GB:
>
> 240000000-33fffffff : virtio0
>   240000000-33fffffff : System RAM (virtio_mem)
>
> So marking "virtio0" always BUSY (especially right from the start) would
> be wrong.

I'm not suggesting to busy the whole "virtio" range, just the portion
that's about to be passed to add_memory_driver_managed().

> The assumption is that anything that's IORESOURCE_SYSTEM_RAM
> and IORESOUCE_BUSY is currently added to the system as system RAM (e.g.,
> after add_memory() and friends, or during boot).
>
> I do agree that manually clearing the busy flag is ugly. What we most
> probably want is request_mem_region() that performs similar checks (no
> overlaps with existing BUSY resources), but doesn't set the region busy.
>

I can't see that working without some way to export and hold the
resource lock until some agent can atomically claim the range.
David Hildenbrand Sept. 24, 2020, 6:12 p.m. UTC | #8
On 24.09.20 15:54, Dan Williams wrote:
> On Thu, Sep 24, 2020 at 12:26 AM David Hildenbrand <david@redhat.com> wrote:
>>
>> On 23.09.20 23:41, Dan Williams wrote:
>>> On Wed, Sep 23, 2020 at 1:04 AM David Hildenbrand <david@redhat.com> wrote:
>>>>
>>>> On 08.09.20 17:33, Joao Martins wrote:
>>>>> [Sorry for the late response]
>>>>>
>>>>> On 8/21/20 11:06 AM, David Hildenbrand wrote:
>>>>>> On 03.08.20 07:03, Dan Williams wrote:
>>>>>>> @@ -37,109 +45,94 @@ int dev_dax_kmem_probe(struct device *dev)
>>>>>>>      * could be mixed in a node with faster memory, causing
>>>>>>>      * unavoidable performance issues.
>>>>>>>      */
>>>>>>> -   numa_node = dev_dax->target_node;
>>>>>>>     if (numa_node < 0) {
>>>>>>>             dev_warn(dev, "rejecting DAX region with invalid node: %d\n",
>>>>>>>                             numa_node);
>>>>>>>             return -EINVAL;
>>>>>>>     }
>>>>>>>
>>>>>>> -   /* Hotplug starting at the beginning of the next block: */
>>>>>>> -   kmem_start = ALIGN(range->start, memory_block_size_bytes());
>>>>>>> -
>>>>>>> -   kmem_size = range_len(range);
>>>>>>> -   /* Adjust the size down to compensate for moving up kmem_start: */
>>>>>>> -   kmem_size -= kmem_start - range->start;
>>>>>>> -   /* Align the size down to cover only complete blocks: */
>>>>>>> -   kmem_size &= ~(memory_block_size_bytes() - 1);
>>>>>>> -   kmem_end = kmem_start + kmem_size;
>>>>>>> -
>>>>>>> -   new_res_name = kstrdup(dev_name(dev), GFP_KERNEL);
>>>>>>> -   if (!new_res_name)
>>>>>>> +   res_name = kstrdup(dev_name(dev), GFP_KERNEL);
>>>>>>> +   if (!res_name)
>>>>>>>             return -ENOMEM;
>>>>>>>
>>>>>>> -   /* Region is permanently reserved if hotremove fails. */
>>>>>>> -   new_res = request_mem_region(kmem_start, kmem_size, new_res_name);
>>>>>>> -   if (!new_res) {
>>>>>>> -           dev_warn(dev, "could not reserve region [%pa-%pa]\n",
>>>>>>> -                    &kmem_start, &kmem_end);
>>>>>>> -           kfree(new_res_name);
>>>>>>> +   res = request_mem_region(range.start, range_len(&range), res_name);
>>>>>>
>>>>>> I think our range could be empty after aligning. I assume
>>>>>> request_mem_region() would check that, but maybe we could report a
>>>>>> better error/warning in that case.
>>>>>>
>>>>> dax_kmem_range() already returns a memory-block-aligned @range but
>>>>> IIUC request_mem_region() isn't checking for that. Having said that
>>>>> the returned @res wouldn't be different from the passed range.start.
>>>>>
>>>>>>>     /*
>>>>>>>      * Ensure that future kexec'd kernels will not treat this as RAM
>>>>>>>      * automatically.
>>>>>>>      */
>>>>>>> -   rc = add_memory_driver_managed(numa_node, new_res->start,
>>>>>>> -                                  resource_size(new_res), kmem_name);
>>>>>>> +   rc = add_memory_driver_managed(numa_node, res->start,
>>>>>>> +                                  resource_size(res), kmem_name);
>>>>>>> +
>>>>>>> +   res->flags |= IORESOURCE_BUSY;
>>>>>>
>>>>>> Hm, I don't think that's correct. Any specific reason why to mark the
>>>>>> not-added, unaligned parts BUSY? E.g., walk_system_ram_range() could
>>>>>> suddenly stumble over it - and e.g., similarly kexec code when trying to
>>>>>> find memory for placing kexec images. I think we should leave this
>>>>>> !BUSY, just as it is right now.
>>>>>>
>>>>> Agreed.
>>>>>
>>>>>>>     if (rc) {
>>>>>>> -           release_resource(new_res);
>>>>>>> -           kfree(new_res);
>>>>>>> -           kfree(new_res_name);
>>>>>>> +           release_mem_region(range.start, range_len(&range));
>>>>>>> +           kfree(res_name);
>>>>>>>             return rc;
>>>>>>>     }
>>>>>>> -   dev_dax->dax_kmem_res = new_res;
>>>>>>> +
>>>>>>> +   dev_set_drvdata(dev, res_name);
>>>>>>>
>>>>>>>     return 0;
>>>>>>>  }
>>>>>>>
>>>>>>>  #ifdef CONFIG_MEMORY_HOTREMOVE
>>>>>>> -static int dev_dax_kmem_remove(struct device *dev)
>>>>>>> +static void dax_kmem_release(struct dev_dax *dev_dax)
>>>>>>>  {
>>>>>>> -   struct dev_dax *dev_dax = to_dev_dax(dev);
>>>>>>> -   struct resource *res = dev_dax->dax_kmem_res;
>>>>>>> -   resource_size_t kmem_start = res->start;
>>>>>>> -   resource_size_t kmem_size = resource_size(res);
>>>>>>> -   const char *res_name = res->name;
>>>>>>>     int rc;
>>>>>>> +   struct device *dev = &dev_dax->dev;
>>>>>>> +   const char *res_name = dev_get_drvdata(dev);
>>>>>>> +   struct range range = dax_kmem_range(dev_dax);
>>>>>>>
>>>>>>>     /*
>>>>>>>      * We have one shot for removing memory, if some memory blocks were not
>>>>>>>      * offline prior to calling this function remove_memory() will fail, and
>>>>>>>      * there is no way to hotremove this memory until reboot because device
>>>>>>> -    * unbind will succeed even if we return failure.
>>>>>>> +    * unbind will proceed regardless of the remove_memory result.
>>>>>>>      */
>>>>>>> -   rc = remove_memory(dev_dax->target_node, kmem_start, kmem_size);
>>>>>>> -   if (rc) {
>>>>>>> -           any_hotremove_failed = true;
>>>>>>> -           dev_err(dev,
>>>>>>> -                   "DAX region %pR cannot be hotremoved until the next reboot\n",
>>>>>>> -                   res);
>>>>>>> -           return rc;
>>>>>>> +   rc = remove_memory(dev_dax->target_node, range.start, range_len(&range));
>>>>>>> +   if (rc == 0) {
>>>>>>
>>>>>> if (!rc) ?
>>>>>>
>>>>> Better off would be to keep the old order:
>>>>>
>>>>>       if (rc) {
>>>>>               any_hotremove_failed = true;
>>>>>               dev_err(dev, "%#llx-%#llx cannot be hotremoved until the next reboot\n",
>>>>>                               range.start, range.end);
>>>>>               return;
>>>>>       }
>>>>>
>>>>>       release_mem_region(range.start, range_len(&range));
>>>>>       dev_set_drvdata(dev, NULL);
>>>>>       kfree(res_name);
>>>>>       return;
>>>>>
>>>>>
>>>>>>> +           release_mem_region(range.start, range_len(&range));
>>>>>>
>>>>>> remove_memory() does a release_mem_region_adjustable(). Don't you
>>>>>> actually want to release the *unaligned* region you requested?
>>>>>>
>>>>> Isn't it what we're doing here?
>>>>> (The release_mem_region_adjustable() is using the same
>>>>> dax_kmem-aligned range and there's no split/adjust)
>>>>>
>>>>> Meaning right now (+ parent marked as !BUSY), and if I am understanding
>>>>> this correctly:
>>>>>
>>>>> request_mem_region(range.start, range_len)
>>>>>    __request_region(iomem_res, range.start, range_len) -> alloc @parent
>>>>> add_memory_driver_managed(parent.start, resource_size(parent))
>>>>>    __request_region(parent.start, resource_size(parent)) -> alloc @child
>>>>>
>>>>> [...]
>>>>>
>>>>> remove_memory(range.start, range_len)
>>>>>  request_mem_region_adjustable(range.start, range_len)
>>>>>   __release_region(range.start, range_len) -> remove @child
>>>>>
>>>>> release_mem_region(range.start, range_len)
>>>>>   __release_region(range.start, range_len) -> doesn't remove @parent because !BUSY?
>>>>>
>>>>> The add/removal of this relies on !BUSY. But now I am wondering if the parent remaining
>>>>> unreleased is deliberate even on CONFIG_MEMORY_HOTREMOVE=y.
>>>>>
>>>>>       Joao
>>>>>
>>>>
>>>> Thinking about it, if we don't set the parent resource BUSY (which is
>>>> what I think is the right way of doing things), and don't want to store
>>>> the parent resource pointer, we could add something like
>>>> lookup_resource() - e.g., lookup_mem_resource() - , however, searching
>>>> properly in the whole hierarchy (instead of only the first level), and
>>>> traversing down to the last hierarchy. Then it would be as simple as
>>>>
>>>> remove_memory(range.start, range_len)
>>>> res = lookup_mem_resource(range.start);
>>>> release_resource(res);
>>>
>>> Another thought... I notice that you've taught
>>> register_memory_resource() a IORESOURCE_MEM_DRIVER_MANAGED special
>>> case. Lets just make the assumption of add_memory_driver_managed()
>>> that it is the driver's responsibility to mark the range busy before
>>> calling, and the driver's responsibility to release the region. I.e.
>>> validate (rather than request) that the range is busy in
>>> register_memory_resource(), and teach release_memory_resource() to
>>> skip releasing the region when the memory is marked driver managed.
>>> That would let dax_kmem drop its manipulation of the 'busy' flag which
>>> is a layering violation no matter how many comments we put around it.
>>
>> IIUC, that won't work for virtio-mem, whereby the parent resource spans
>> multiple possible (future) add_memory_driver_managed() calls and is
>> (just like for kmem) a pure indication to which device memory ranges belong.
>>
>> For example, when exposing 2GB via a virtio-mem device with max 4GB:
>>
>> (/proc/iomem)
>> 240000000-33fffffff : virtio0
>>   240000000-2bfffffff : System RAM (virtio_mem)
>>
>> And after hotplugging additional 2GB:
>>
>> 240000000-33fffffff : virtio0
>>   240000000-33fffffff : System RAM (virtio_mem)
>>
>> So marking "virtio0" always BUSY (especially right from the start) would
>> be wrong.
> 
> I'm not suggesting to busy the whole "virtio" range, just the portion
> that's about to be passed to add_memory_driver_managed().

I'm afraid I don't get your point. For virtio-mem:

Before:

1. Create virtio0 container resource

2. (somewhen in the future) add_memory_driver_managed()
 - Create resource (System RAM (virtio_mem)), marking it busy/driver
   managed

After:

1. Create virtio0 container resource

2. (somewhen in the future) Create resource (System RAM (virtio_mem)),
   marking it busy/driver managed
3. add_memory_driver_managed()

Not helpful or simpler IMHO.

> 
>> The assumption is that anything that's IORESOURCE_SYSTEM_RAM
>> and IORESOUCE_BUSY is currently added to the system as system RAM (e.g.,
>> after add_memory() and friends, or during boot).
>>
>> I do agree that manually clearing the busy flag is ugly. What we most
>> probably want is request_mem_region() that performs similar checks (no
>> overlaps with existing BUSY resources), but doesn't set the region busy.
>>
> 
> I can't see that working without some way to export and hold the
> resource lock until some agent can atomically claim the range.

I don't think we have to care about races here. The "BUSY" check is
really just a check for leftovers, e.g., after kexec or after driver
reloading. If somebody else would try to concurrently add System RAM
/something else within the range of your device, something else, very
weird, would be going on (let's call it a BUG, just like if somebody
would be removing system RAM in your device range ...).

For example, in case of virtio-mem, when you unload the driver, it
cannot remove the "virtio0" resource in case some system ram in the
range is still plugged (busy).

So when reloading the driver, it would try to re-create the virtio0
resource, detect that some system ram in the range is still BUSY, and
fail gracefully. This is how it works and how it's expected to work - at
least for virtio-mem.

I assume something similar can be observed with kmem, when trying to
reload the driver or similar - but races shouldn't be relevant here.
Dan Williams Sept. 24, 2020, 9:26 p.m. UTC | #9
[..]
> > I'm not suggesting to busy the whole "virtio" range, just the portion
> > that's about to be passed to add_memory_driver_managed().
>
> I'm afraid I don't get your point. For virtio-mem:
>
> Before:
>
> 1. Create virtio0 container resource
>
> 2. (somewhen in the future) add_memory_driver_managed()
>  - Create resource (System RAM (virtio_mem)), marking it busy/driver
>    managed
>
> After:
>
> 1. Create virtio0 container resource
>
> 2. (somewhen in the future) Create resource (System RAM (virtio_mem)),
>    marking it busy/driver managed
> 3. add_memory_driver_managed()
>
> Not helpful or simpler IMHO.

The concern I'm trying to address is the theoretical race window and
layering violation in this sequence in the kmem driver:

1/ res = request_mem_region(...);
2/ res->flags = IORESOURCE_MEM;
3/ add_memory_driver_managed();

Between 2/ and 3/ something can race and think that it owns the
region. Do I think it will happen in practice, no, but it's still a
pattern that deserves some cleanup.
David Hildenbrand Sept. 24, 2020, 9:41 p.m. UTC | #10
> Am 24.09.2020 um 23:26 schrieb Dan Williams <dan.j.williams@intel.com>:
> 
> [..]
>>> I'm not suggesting to busy the whole "virtio" range, just the portion
>>> that's about to be passed to add_memory_driver_managed().
>> 
>> I'm afraid I don't get your point. For virtio-mem:
>> 
>> Before:
>> 
>> 1. Create virtio0 container resource
>> 
>> 2. (somewhen in the future) add_memory_driver_managed()
>> - Create resource (System RAM (virtio_mem)), marking it busy/driver
>>   managed
>> 
>> After:
>> 
>> 1. Create virtio0 container resource
>> 
>> 2. (somewhen in the future) Create resource (System RAM (virtio_mem)),
>>   marking it busy/driver managed
>> 3. add_memory_driver_managed()
>> 
>> Not helpful or simpler IMHO.
> 
> The concern I'm trying to address is the theoretical race window and
> layering violation in this sequence in the kmem driver:
> 
> 1/ res = request_mem_region(...);
> 2/ res->flags = IORESOURCE_MEM;
> 3/ add_memory_driver_managed();
> 
> Between 2/ and 3/ something can race and think that it owns the
> region. Do I think it will happen in practice, no, but it's still a
> pattern that deserves come cleanup.

I think in that unlikely event (rather impossible), add_memory_driver_managed() should fail, detecting a conflicting (busy) resource. Not sure what will happen next (and did not double-check).

But yeah, the way the BUSY bit is cleared here is wrong - simply overwriting other bits. And it would be even better if we could avoid manually messing with flags here.
>
Dan Williams Sept. 24, 2020, 9:50 p.m. UTC | #11
On Thu, Sep 24, 2020 at 2:42 PM David Hildenbrand <david@redhat.com> wrote:
>
>
>
> > Am 24.09.2020 um 23:26 schrieb Dan Williams <dan.j.williams@intel.com>:
> >
> > [..]
> >>> I'm not suggesting to busy the whole "virtio" range, just the portion
> >>> that's about to be passed to add_memory_driver_managed().
> >>
> >> I'm afraid I don't get your point. For virtio-mem:
> >>
> >> Before:
> >>
> >> 1. Create virtio0 container resource
> >>
> >> 2. (somewhen in the future) add_memory_driver_managed()
> >> - Create resource (System RAM (virtio_mem)), marking it busy/driver
> >>   managed
> >>
> >> After:
> >>
> >> 1. Create virtio0 container resource
> >>
> >> 2. (somewhen in the future) Create resource (System RAM (virtio_mem)),
> >>   marking it busy/driver managed
> >> 3. add_memory_driver_managed()
> >>
> >> Not helpful or simpler IMHO.
> >
> > The concern I'm trying to address is the theoretical race window and
> > layering violation in this sequence in the kmem driver:
> >
> > 1/ res = request_mem_region(...);
> > 2/ res->flags = IORESOURCE_MEM;
> > 3/ add_memory_driver_managed();
> >
> > Between 2/ and 3/ something can race and think that it owns the
> > region. Do I think it will happen in practice, no, but it's still a
> > pattern that deserves come cleanup.
>
> I think in that unlikely event (rather impossible), add_memory_driver_managed() should fail, detecting a conflicting (busy) resource. Not sure what will happen next ( and did not double-check).

add_memory_driver_managed() will fail, but the release_mem_region() in
kmem to unwind on the error path will do the wrong thing because that
other driver thinks it got ownership of the region.

> But yeah, the way the BUSY bit is cleared here is wrong - simply overwriting other bits. And it would be even better if we could avoid manually messing with flags here.

I'm ok to leave it alone for now (hasn't been and likely never will be
a problem in practice), but I think it was still worth grumbling
about. I'll leave that part of kmem alone in the upcoming split of
dax_kmem_res removal.
David Hildenbrand Sept. 25, 2020, 8:54 a.m. UTC | #12
On 24.09.20 23:50, Dan Williams wrote:
> On Thu, Sep 24, 2020 at 2:42 PM David Hildenbrand <david@redhat.com> wrote:
>>
>>
>>
>>> Am 24.09.2020 um 23:26 schrieb Dan Williams <dan.j.williams@intel.com>:
>>>
>>> [..]
>>>>> I'm not suggesting to busy the whole "virtio" range, just the portion
>>>>> that's about to be passed to add_memory_driver_managed().
>>>>
>>>> I'm afraid I don't get your point. For virtio-mem:
>>>>
>>>> Before:
>>>>
>>>> 1. Create virtio0 container resource
>>>>
>>>> 2. (somewhen in the future) add_memory_driver_managed()
>>>> - Create resource (System RAM (virtio_mem)), marking it busy/driver
>>>>   managed
>>>>
>>>> After:
>>>>
>>>> 1. Create virtio0 container resource
>>>>
>>>> 2. (somewhen in the future) Create resource (System RAM (virtio_mem)),
>>>>   marking it busy/driver managed
>>>> 3. add_memory_driver_managed()
>>>>
>>>> Not helpful or simpler IMHO.
>>>
>>> The concern I'm trying to address is the theoretical race window and
>>> layering violation in this sequence in the kmem driver:
>>>
>>> 1/ res = request_mem_region(...);
>>> 2/ res->flags = IORESOURCE_MEM;
>>> 3/ add_memory_driver_managed();
>>>
>>> Between 2/ and 3/ something can race and think that it owns the
>>> region. Do I think it will happen in practice, no, but it's still a
>>> pattern that deserves come cleanup.
>>
>> I think in that unlikely event (rather impossible), add_memory_driver_managed() should fail, detecting a conflicting (busy) resource. Not sure what will happen next ( and did not double-check).
> 
> add_memory_driver_managed() will fail, but the release_mem_region() in
> kmem to unwind on the error path will do the wrong thing because that
> other driver thinks it got ownership of the region.
> 

I think if somebody would race and claim the region for itself (after we
unchecked the BUSY flag), there would be another memory resource below
our resource container (e.g., via __request_region()).

So, interestingly, the current code will do a

release_resource->__release_resource(old, true);

which will remove whatever somebody added below the resource.

If we were to do a

remove_resource->__release_resource(old, false);

we would only remove what we temporarily added, relocating any children
(that someone nasty added).

But yeah, I don't think we have to worry about this case.

>> But yeah, the way the BUSY bit is cleared here is wrong - simply overwriting other bits. And it would be even better if we could avoid manually messing with flags here.
> 
> I'm ok to leave it alone for now (hasn't been and likely never will be
> a problem in practice), but I think it was still worth grumbling

Definitely, it gives us a better understanding.

> about. I'll leave that part of kmem alone in the upcoming split of
> dax_kmem_res removal.

Yeah, stuff is more complicated than I would wished, so I guess it's
better to leave it alone for now until we actually see issues with
somebody else regarding *our* device-owned region (or we're able to come
up with a cleanup that keeps all corner cases working for kmem and
virtio-mem).
diff mbox series

Patch

diff --git a/drivers/dax/dax-private.h b/drivers/dax/dax-private.h
index 6779f683671d..12a2dbc43b40 100644
--- a/drivers/dax/dax-private.h
+++ b/drivers/dax/dax-private.h
@@ -42,8 +42,6 @@  struct dax_region {
  * @dev - device core
  * @pgmap - pgmap for memmap setup / lifetime (driver owned)
  * @range: resource range for the instance
- * @dax_mem_res: physical address range of hotadded DAX memory
- * @dax_mem_name: name for hotadded DAX memory via add_memory_driver_managed()
  */
 struct dev_dax {
 	struct dax_region *region;
@@ -52,7 +50,6 @@  struct dev_dax {
 	struct device dev;
 	struct dev_pagemap *pgmap;
 	struct range range;
-	struct resource *dax_kmem_res;
 };
 
 static inline u64 range_len(struct range *range)
diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 5bb133df147d..77e25361fbeb 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -19,16 +19,24 @@  static const char *kmem_name;
 /* Set if any memory will remain added when the driver will be unloaded. */
 static bool any_hotremove_failed;
 
+static struct range dax_kmem_range(struct dev_dax *dev_dax)
+{
+	struct range range;
+
+	/* memory-block align the hotplug range */
+	range.start = ALIGN(dev_dax->range.start, memory_block_size_bytes());
+	range.end = ALIGN_DOWN(dev_dax->range.end + 1,
+			memory_block_size_bytes()) - 1;
+	return range;
+}
+
 int dev_dax_kmem_probe(struct device *dev)
 {
 	struct dev_dax *dev_dax = to_dev_dax(dev);
-	struct range *range = &dev_dax->range;
-	resource_size_t kmem_start;
-	resource_size_t kmem_size;
-	resource_size_t kmem_end;
-	struct resource *new_res;
-	const char *new_res_name;
-	int numa_node;
+	struct range range = dax_kmem_range(dev_dax);
+	int numa_node = dev_dax->target_node;
+	struct resource *res;
+	char *res_name;
 	int rc;
 
 	/*
@@ -37,109 +45,94 @@  int dev_dax_kmem_probe(struct device *dev)
 	 * could be mixed in a node with faster memory, causing
 	 * unavoidable performance issues.
 	 */
-	numa_node = dev_dax->target_node;
 	if (numa_node < 0) {
 		dev_warn(dev, "rejecting DAX region with invalid node: %d\n",
 				numa_node);
 		return -EINVAL;
 	}
 
-	/* Hotplug starting at the beginning of the next block: */
-	kmem_start = ALIGN(range->start, memory_block_size_bytes());
-
-	kmem_size = range_len(range);
-	/* Adjust the size down to compensate for moving up kmem_start: */
-	kmem_size -= kmem_start - range->start;
-	/* Align the size down to cover only complete blocks: */
-	kmem_size &= ~(memory_block_size_bytes() - 1);
-	kmem_end = kmem_start + kmem_size;
-
-	new_res_name = kstrdup(dev_name(dev), GFP_KERNEL);
-	if (!new_res_name)
+	res_name = kstrdup(dev_name(dev), GFP_KERNEL);
+	if (!res_name)
 		return -ENOMEM;
 
-	/* Region is permanently reserved if hotremove fails. */
-	new_res = request_mem_region(kmem_start, kmem_size, new_res_name);
-	if (!new_res) {
-		dev_warn(dev, "could not reserve region [%pa-%pa]\n",
-			 &kmem_start, &kmem_end);
-		kfree(new_res_name);
+	res = request_mem_region(range.start, range_len(&range), res_name);
+	if (!res) {
+		dev_warn(dev, "could not reserve region [%#llx-%#llx]\n",
+				range.start, range.end);
+		kfree(res_name);
 		return -EBUSY;
 	}
 
 	/*
-	 * Set flags appropriate for System RAM.  Leave ..._BUSY clear
-	 * so that add_memory() can add a child resource.  Do not
-	 * inherit flags from the parent since it may set new flags
-	 * unknown to us that will break add_memory() below.
+	 * Temporarily clear busy to allow add_memory_driver_managed()
+	 * to claim it.
 	 */
-	new_res->flags = IORESOURCE_SYSTEM_RAM;
+	res->flags &= ~IORESOURCE_BUSY;
 
 	/*
 	 * Ensure that future kexec'd kernels will not treat this as RAM
 	 * automatically.
 	 */
-	rc = add_memory_driver_managed(numa_node, new_res->start,
-				       resource_size(new_res), kmem_name);
+	rc = add_memory_driver_managed(numa_node, res->start,
+				       resource_size(res), kmem_name);
+
+	res->flags |= IORESOURCE_BUSY;
 	if (rc) {
-		release_resource(new_res);
-		kfree(new_res);
-		kfree(new_res_name);
+		release_mem_region(range.start, range_len(&range));
+		kfree(res_name);
 		return rc;
 	}
-	dev_dax->dax_kmem_res = new_res;
+
+	dev_set_drvdata(dev, res_name);
 
 	return 0;
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-static int dev_dax_kmem_remove(struct device *dev)
+static void dax_kmem_release(struct dev_dax *dev_dax)
 {
-	struct dev_dax *dev_dax = to_dev_dax(dev);
-	struct resource *res = dev_dax->dax_kmem_res;
-	resource_size_t kmem_start = res->start;
-	resource_size_t kmem_size = resource_size(res);
-	const char *res_name = res->name;
 	int rc;
+	struct device *dev = &dev_dax->dev;
+	const char *res_name = dev_get_drvdata(dev);
+	struct range range = dax_kmem_range(dev_dax);
 
 	/*
 	 * We have one shot for removing memory, if some memory blocks were not
 	 * offline prior to calling this function remove_memory() will fail, and
 	 * there is no way to hotremove this memory until reboot because device
-	 * unbind will succeed even if we return failure.
+	 * unbind will proceed regardless of the remove_memory result.
 	 */
-	rc = remove_memory(dev_dax->target_node, kmem_start, kmem_size);
-	if (rc) {
-		any_hotremove_failed = true;
-		dev_err(dev,
-			"DAX region %pR cannot be hotremoved until the next reboot\n",
-			res);
-		return rc;
+	rc = remove_memory(dev_dax->target_node, range.start, range_len(&range));
+	if (rc == 0) {
+		release_mem_region(range.start, range_len(&range));
+		dev_set_drvdata(dev, NULL);
+		kfree(res_name);
+		return;
 	}
 
-	/* Release and free dax resources */
-	release_resource(res);
-	kfree(res);
-	kfree(res_name);
-	dev_dax->dax_kmem_res = NULL;
-
-	return 0;
+	any_hotremove_failed = true;
+	dev_err(dev, "%#llx-%#llx cannot be hotremoved until the next reboot\n",
+			range.start, range.end);
 }
 #else
-static int dev_dax_kmem_remove(struct device *dev)
+static void dax_kmem_release(struct dev_dax *dev_dax)
 {
 	/*
-	 * Without hotremove purposely leak the request_mem_region() for the
-	 * device-dax range and return '0' to ->remove() attempts. The removal
-	 * of the device from the driver always succeeds, but the region is
-	 * permanently pinned as reserved by the unreleased
-	 * request_mem_region().
+	 * Without hotremove purposely leak the request_mem_region() for
+	 * the device-dax range attempts. The removal of the device from
+	 * the driver always succeeds, but the region is permanently
+	 * pinned as reserved by the unreleased request_mem_region().
 	 */
 	any_hotremove_failed = true;
-	return 0;
 }
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
+static int dev_dax_kmem_remove(struct device *dev)
+{
+	dax_kmem_release(to_dev_dax(dev));
+	return 0;
+}
+
 static struct dax_device_driver device_dax_kmem_driver = {
 	.drv = {
 		.probe = dev_dax_kmem_probe,