diff mbox series

[NDCTL,v3] cxl/region: Add -f option for disable-region

Message ID 169878724592.82931.11180459815481606425.stgit@djiang5-mobl3
State Accepted
Commit 9399aa667ab026a819f8a00c31d9f56bb028e191
Headers show
Series [NDCTL,v3] cxl/region: Add -f option for disable-region | expand

Commit Message

Dave Jiang Oct. 31, 2023, 9:20 p.m. UTC
The current operation for disable-region does not check if the memory
covered by a region is online before attempting to disable the cxl region.
Have the tool attempt to offline the relevant memory before attempting to
disable the region(s). If offline fails, stop and return error.

Provide a -f option for the region to continue disable the region even if
the memory is not offlined. Add a warning to state that the physical
memory is being leaked and unrecoverable unless reboot due to disable without
offline.

Signed-off-by: Dave Jiang <dave.jiang@intel.com>

---
v3:
- Remove movable check. (Dan)
- Attempt to offline if not offline. -f will disable region anyways even
  if memory not offline. (Dan)
v2:
- Update documentation and help output. (Vishal)
---
 Documentation/cxl/cxl-disable-region.txt |   10 ++++++
 cxl/region.c                             |   54 +++++++++++++++++++++++++++++-
 2 files changed, 63 insertions(+), 1 deletion(-)

Comments

Alison Schofield Nov. 2, 2023, 10:51 p.m. UTC | #1
On Tue, Oct 31, 2023 at 02:20:45PM -0700, Dave Jiang wrote:

snip

> +	Attempt to disable-region even though memory cannot be offlined successfully.
> +	Will emit warning that operation will permanently leak phiscal address space
> +	and cannot be recovered until a reboot.

physical

With that,
Reviewed-by: Alison Schofield <alison.schofield@intel.com>

> 
> 
>
Fan Ni Nov. 9, 2023, 7:32 p.m. UTC | #2
On Tue, Oct 31, 2023 at 02:20:45PM -0700, Dave Jiang wrote:
> The current operation for disable-region does not check if the memory
> covered by a region is online before attempting to disable the cxl region.
> Have the tool attempt to offline the relevant memory before attempting to
> disable the region(s). If offline fails, stop and return error.
> 
> Provide a -f option for the region to continue disable the region even if
> the memory is not offlined. Add a warning to state that the physical
> memory is being leaked and unrecoverable unless reboot due to disable without
> offline.
> 
> Signed-off-by: Dave Jiang <dave.jiang@intel.com>

Reviewed-by: Fan Ni <fan.ni@samsung.com>

> 
> ---
> v3:
> - Remove movable check. (Dan)
> - Attempt to offline if not offline. -f will disable region anyways even
>   if memory not offline. (Dan)
> v2:
> - Update documentation and help output. (Vishal)
> ---
>  Documentation/cxl/cxl-disable-region.txt |   10 ++++++
>  cxl/region.c                             |   54 +++++++++++++++++++++++++++++-
>  2 files changed, 63 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/cxl/cxl-disable-region.txt b/Documentation/cxl/cxl-disable-region.txt
> index 4b0625e40bf6..9abf19e96094 100644
> --- a/Documentation/cxl/cxl-disable-region.txt
> +++ b/Documentation/cxl/cxl-disable-region.txt
> @@ -14,6 +14,10 @@ SYNOPSIS
>  
>  include::region-description.txt[]
>  
> +If there are memory blocks that are still online, the operation will attempt to
> +offline the relevant blocks. If the offlining fails, the operation fails when not
> +using the -f (force) parameter.
> +
>  EXAMPLE
>  -------
>  ----
> @@ -27,6 +31,12 @@ OPTIONS
>  -------
>  include::bus-option.txt[]
>  
> +-f::
> +--force::
> +	Attempt to disable-region even though memory cannot be offlined successfully.
> +	Will emit warning that operation will permanently leak phiscal address space
> +	and cannot be recovered until a reboot.
> +
>  include::decoder-option.txt[]
>  
>  include::debug-option.txt[]
> diff --git a/cxl/region.c b/cxl/region.c
> index bcd703956207..5cbbf2749e2d 100644
> --- a/cxl/region.c
> +++ b/cxl/region.c
> @@ -14,6 +14,7 @@
>  #include <util/parse-options.h>
>  #include <ccan/minmax/minmax.h>
>  #include <ccan/short_types/short_types.h>
> +#include <daxctl/libdaxctl.h>
>  
>  #include "filter.h"
>  #include "json.h"
> @@ -95,6 +96,8 @@ static const struct option enable_options[] = {
>  
>  static const struct option disable_options[] = {
>  	BASE_OPTIONS(),
> +	OPT_BOOLEAN('f', "force", &param.force,
> +		    "attempt to offline memory before disabling the region"),
>  	OPT_END(),
>  };
>  
> @@ -789,13 +792,62 @@ static int destroy_region(struct cxl_region *region)
>  	return cxl_region_delete(region);
>  }
>  
> +static int disable_region(struct cxl_region *region)
> +{
> +	const char *devname = cxl_region_get_devname(region);
> +	struct daxctl_region *dax_region;
> +	struct daxctl_memory *mem;
> +	struct daxctl_dev *dev;
> +	int failed = 0, rc;
> +
> +	dax_region = cxl_region_get_daxctl_region(region);
> +	if (!dax_region)
> +		goto out;
> +
> +	daxctl_dev_foreach(dax_region, dev) {
> +		mem = daxctl_dev_get_memory(dev);
> +		if (!mem)
> +			return -ENXIO;
> +
> +		/*
> +		 * If memory is still online and user wants to force it, attempt
> +		 * to offline it.
> +		 */
> +		if (daxctl_memory_is_online(mem)) {
> +			rc = daxctl_memory_offline(mem);
> +			if (rc < 0) {
> +				log_err(&rl, "%s: unable to offline %s: %s\n",
> +					devname,
> +					daxctl_dev_get_devname(dev),
> +					strerror(abs(rc)));
> +				if (!param.force)
> +					return rc;
> +
> +				failed++;
> +			}
> +		}
> +	}
> +
> +	if (failed) {
> +		log_err(&rl, "%s: Forcing region disable without successful offline.\n",
> +			devname);
> +		log_err(&rl, "%s: Physical address space has now been permanently leaked.\n",
> +			devname);
> +		log_err(&rl, "%s: Leaked address cannot be recovered until a reboot.\n",
> +			devname);
> +	}
> +
> +out:
> +	return cxl_region_disable(region);
> +}
> +
>  static int do_region_xable(struct cxl_region *region, enum region_actions action)
>  {
>  	switch (action) {
>  	case ACTION_ENABLE:
>  		return cxl_region_enable(region);
>  	case ACTION_DISABLE:
> -		return cxl_region_disable(region);
> +		return disable_region(region);
>  	case ACTION_DESTROY:
>  		return destroy_region(region);
>  	default:
> 
>
Cao, Quanquan/曹 全全 Nov. 27, 2023, 9:34 a.m. UTC | #3
> +static int disable_region(struct cxl_region *region)
> +{
> +	const char *devname = cxl_region_get_devname(region);
> +	struct daxctl_region *dax_region;
> +	struct daxctl_memory *mem;
> +	struct daxctl_dev *dev;
> +	int failed = 0, rc;
> +
> +	dax_region = cxl_region_get_daxctl_region(region);
> +	if (!dax_region)
> +		goto out;
> +
> +	daxctl_dev_foreach(dax_region, dev) {
> +		mem = daxctl_dev_get_memory(dev);
> +		if (!mem)
> +			return -ENXIO;
> +
> +		/*
> +		 * If memory is still online and user wants to force it, attempt
> +		 * to offline it.
> +		 */
> +		if (daxctl_memory_is_online(mem)) {
> +			rc = daxctl_memory_offline(mem);
> +			if (rc < 0) {
> +				log_err(&rl, "%s: unable to offline %s: %s\n",
> +					devname,
> +					daxctl_dev_get_devname(dev),
> +					strerror(abs(rc)));
> +				if (!param.force)
> +					return rc;
> +
> +				failed++;
> +			}
> +		}
> +	}
> +
> +	if (failed) {
> +		log_err(&rl, "%s: Forcing region disable without successful offline.\n",
> +			devname);
> +		log_err(&rl, "%s: Physical address space has now been permanently leaked.\n",
> +			devname);
> +		log_err(&rl, "%s: Leaked address cannot be recovered until a reboot.\n",
> +			devname);
> +	}
> +

>   static int do_region_xable(struct cxl_region *region, enum region_actions action)
>   {
>   	switch (action) {
>   	case ACTION_ENABLE:
>   		return cxl_region_enable(region);
>   	case ACTION_DISABLE:
> -		return cxl_region_disable(region);
> +		return disable_region(region);
>   	case ACTION_DESTROY:
>   		return destroy_region(region);
>   	default:

Hi Dave

In this patch, a new function 'disable_region(region)' has been added. 
When using the 'cxl destroy-region region0 -f' command, there's a check 
first, followed by the 'destroy-region' operation. In terms of 
user-friendliness, which function is more user-friendly: 
'cxl_region_disable(region)' or 'disable_region(region)'?

Attach destroy_region section code
static int destroy_region(struct cxl_region *region)
{
     const char *devname = cxl_region_get_devname(region);
     unsigned int ways, i;
     int rc;

     /* First, unbind/disable the region if needed */
     if (cxl_region_is_enabled(region)) {
         if (param.force) {
             rc = cxl_region_disable(region);
             if (rc) {
                 log_err(&rl, "%s: error disabling region: %s\n",
                     devname, strerror(-rc));
                 return rc;
             }
         } else {
             log_err(&rl, "%s active. Disable it or use --force\n",
                 devname);
             return -EBUSY;
         }
     }

I have considered two options for your reference:

1.Assuming the user hasn't executed the 'cxl disable-region region0' 
command and directly runs 'cxl destroy-region region0 -f', using the 
'disable_region(region)' function to first take the region offline and 
then disable it might be more user-friendly.
2.If the user executes the 'cxl disable-region region0' command but 
fails to take it offline successfully, then runs 'cxl destroy-region 
region0 -f', using the 'cxl_region_disable(region)' function to directly 
'disable region' and then 'destroy region' would also be reasonable.
Dave Jiang Nov. 27, 2023, 5:13 p.m. UTC | #4
On 11/27/23 02:34, Cao, Quanquan/曹 全全 wrote:
> 
> 
>> +static int disable_region(struct cxl_region *region)
>> +{
>> +    const char *devname = cxl_region_get_devname(region);
>> +    struct daxctl_region *dax_region;
>> +    struct daxctl_memory *mem;
>> +    struct daxctl_dev *dev;
>> +    int failed = 0, rc;
>> +
>> +    dax_region = cxl_region_get_daxctl_region(region);
>> +    if (!dax_region)
>> +        goto out;
>> +
>> +    daxctl_dev_foreach(dax_region, dev) {
>> +        mem = daxctl_dev_get_memory(dev);
>> +        if (!mem)
>> +            return -ENXIO;
>> +
>> +        /*
>> +         * If memory is still online and user wants to force it, attempt
>> +         * to offline it.
>> +         */
>> +        if (daxctl_memory_is_online(mem)) {
>> +            rc = daxctl_memory_offline(mem);
>> +            if (rc < 0) {
>> +                log_err(&rl, "%s: unable to offline %s: %s\n",
>> +                    devname,
>> +                    daxctl_dev_get_devname(dev),
>> +                    strerror(abs(rc)));
>> +                if (!param.force)
>> +                    return rc;
>> +
>> +                failed++;
>> +            }
>> +        }
>> +    }
>> +
>> +    if (failed) {
>> +        log_err(&rl, "%s: Forcing region disable without successful offline.\n",
>> +            devname);
>> +        log_err(&rl, "%s: Physical address space has now been permanently leaked.\n",
>> +            devname);
>> +        log_err(&rl, "%s: Leaked address cannot be recovered until a reboot.\n",
>> +            devname);
>> +    }
>> +
> 
>>   static int do_region_xable(struct cxl_region *region, enum region_actions action)
>>   {
>>       switch (action) {
>>       case ACTION_ENABLE:
>>           return cxl_region_enable(region);
>>       case ACTION_DISABLE:
>> -        return cxl_region_disable(region);
>> +        return disable_region(region);
>>       case ACTION_DESTROY:
>>           return destroy_region(region);
>>       default:
> 
> Hi Dave
> 
> In this patch, a new function 'disable_region(region)' has been added. When using the 'cxl destroy-region region0 -f' command, there's a check first, followed by the 'destroy-region' operation. In terms of user-friendliness, which function is more user-friendly: 'cxl_region_disable(region)' or 'disable_region(region)'?
> 
> Attach destroy_region section code
> static int destroy_region(struct cxl_region *region)
> {
>     const char *devname = cxl_region_get_devname(region);
>     unsigned int ways, i;
>     int rc;
> 
>     /* First, unbind/disable the region if needed */
>     if (cxl_region_is_enabled(region)) {
>         if (param.force) {
>             rc = cxl_region_disable(region);
>             if (rc) {
>                 log_err(&rl, "%s: error disabling region: %s\n",
>                     devname, strerror(-rc));
>                 return rc;
>             }
>         } else {
>             log_err(&rl, "%s active. Disable it or use --force\n",
>                 devname);
>             return -EBUSY;
>         }
>     }
> 
> I have considered two options for your reference:
> 
> 1.Assuming the user hasn't executed the 'cxl disable-region region0' command and directly runs 'cxl destroy-region region0 -f', using the 'disable_region(region)' function to first take the region offline and then disable it might be more user-friendly.
> 2.If the user executes the 'cxl disable-region region0' command but fails to take it offline successfully, then runs 'cxl destroy-region region0 -f', using the 'cxl_region_disable(region)' function to directly 'disable region' and then 'destroy region' would also be reasonable.

To make the behavior consistent, I think we should use disable_region() with the check for the destroy_region() path.

What do you think Vishal?
> 
> 
> 
> 
>
Verma, Vishal L Nov. 27, 2023, 5:58 p.m. UTC | #5
On Mon, 2023-11-27 at 10:13 -0700, Dave Jiang wrote:
> On 11/27/23 02:34, Cao, Quanquan/曹 全全 wrote:
> > 
> > 
> > 1.Assuming the user hasn't executed the 'cxl disable-region
> > region0' command and directly runs 'cxl destroy-region region0 -f',
> > using the 'disable_region(region)' function to first take the
> > region offline and then disable it might be more user-friendly.
> > 2.If the user executes the 'cxl disable-region region0' command but
> > fails to take it offline successfully, then runs 'cxl destroy-
> > region region0 -f', using the 'cxl_region_disable(region)' function
> > to directly 'disable region' and then 'destroy region' would also
> > be reasonable.
> 
> To make the behavior consistent, I think we should use
> disable_region() with the check for the destroy_region() path.
> 
> What do you think Vishal?
> > 
Yep agreed, using the new helper in destroy-region also makes sense. Do
you want to send a new patch for this - I've already applied this
series to the pending branch.
Dave Jiang Nov. 27, 2023, 11:28 p.m. UTC | #6
On 11/27/23 10:58, Verma, Vishal L wrote:
> On Mon, 2023-11-27 at 10:13 -0700, Dave Jiang wrote:
>> On 11/27/23 02:34, Cao, Quanquan/曹 全全 wrote:
>>>
>>>
>>> 1.Assuming the user hasn't executed the 'cxl disable-region
>>> region0' command and directly runs 'cxl destroy-region region0 -f',
>>> using the 'disable_region(region)' function to first take the
>>> region offline and then disable it might be more user-friendly.
>>> 2.If the user executes the 'cxl disable-region region0' command but
>>> fails to take it offline successfully, then runs 'cxl destroy-
>>> region region0 -f', using the 'cxl_region_disable(region)' function
>>> to directly 'disable region' and then 'destroy region' would also
>>> be reasonable.
>>
>> To make the behavior consistent, I think we should use
>> disable_region() with the check for the destroy_region() path.
>>
>> What do you think Vishal?
>>>
> Yep agreed, using the new helper in destroy-region also makes sense. Do
> you want to send a new patch for this - I've already applied this
> series to the pending branch.

Yeah I can send a new patch.
diff mbox series

Patch

diff --git a/Documentation/cxl/cxl-disable-region.txt b/Documentation/cxl/cxl-disable-region.txt
index 4b0625e40bf6..9abf19e96094 100644
--- a/Documentation/cxl/cxl-disable-region.txt
+++ b/Documentation/cxl/cxl-disable-region.txt
@@ -14,6 +14,10 @@  SYNOPSIS
 
 include::region-description.txt[]
 
+If there are memory blocks that are still online, the operation will attempt to
+offline the relevant blocks. If the offlining fails, the operation fails when not
+using the -f (force) parameter.
+
 EXAMPLE
 -------
 ----
@@ -27,6 +31,12 @@  OPTIONS
 -------
 include::bus-option.txt[]
 
+-f::
+--force::
+	Attempt to disable-region even though memory cannot be offlined successfully.
+	Will emit warning that operation will permanently leak phiscal address space
+	and cannot be recovered until a reboot.
+
 include::decoder-option.txt[]
 
 include::debug-option.txt[]
diff --git a/cxl/region.c b/cxl/region.c
index bcd703956207..5cbbf2749e2d 100644
--- a/cxl/region.c
+++ b/cxl/region.c
@@ -14,6 +14,7 @@ 
 #include <util/parse-options.h>
 #include <ccan/minmax/minmax.h>
 #include <ccan/short_types/short_types.h>
+#include <daxctl/libdaxctl.h>
 
 #include "filter.h"
 #include "json.h"
@@ -95,6 +96,8 @@  static const struct option enable_options[] = {
 
 static const struct option disable_options[] = {
 	BASE_OPTIONS(),
+	OPT_BOOLEAN('f', "force", &param.force,
+		    "attempt to offline memory before disabling the region"),
 	OPT_END(),
 };
 
@@ -789,13 +792,62 @@  static int destroy_region(struct cxl_region *region)
 	return cxl_region_delete(region);
 }
 
+static int disable_region(struct cxl_region *region)
+{
+	const char *devname = cxl_region_get_devname(region);
+	struct daxctl_region *dax_region;
+	struct daxctl_memory *mem;
+	struct daxctl_dev *dev;
+	int failed = 0, rc;
+
+	dax_region = cxl_region_get_daxctl_region(region);
+	if (!dax_region)
+		goto out;
+
+	daxctl_dev_foreach(dax_region, dev) {
+		mem = daxctl_dev_get_memory(dev);
+		if (!mem)
+			return -ENXIO;
+
+		/*
+		 * If memory is still online and user wants to force it, attempt
+		 * to offline it.
+		 */
+		if (daxctl_memory_is_online(mem)) {
+			rc = daxctl_memory_offline(mem);
+			if (rc < 0) {
+				log_err(&rl, "%s: unable to offline %s: %s\n",
+					devname,
+					daxctl_dev_get_devname(dev),
+					strerror(abs(rc)));
+				if (!param.force)
+					return rc;
+
+				failed++;
+			}
+		}
+	}
+
+	if (failed) {
+		log_err(&rl, "%s: Forcing region disable without successful offline.\n",
+			devname);
+		log_err(&rl, "%s: Physical address space has now been permanently leaked.\n",
+			devname);
+		log_err(&rl, "%s: Leaked address cannot be recovered until a reboot.\n",
+			devname);
+	}
+
+out:
+	return cxl_region_disable(region);
+}
+
 static int do_region_xable(struct cxl_region *region, enum region_actions action)
 {
 	switch (action) {
 	case ACTION_ENABLE:
 		return cxl_region_enable(region);
 	case ACTION_DISABLE:
-		return cxl_region_disable(region);
+		return disable_region(region);
 	case ACTION_DESTROY:
 		return destroy_region(region);
 	default: