diff mbox series

[ndctl,07/10] daxctl: detect races when onlining memory blocks

Message ID 20191002234925.9190-8-vishal.l.verma@intel.com (mailing list archive)
State Superseded
Commit 86405a79242dfca00c63ee0d1a5f7f33013ccb2d
Headers show
Series fixes and movability for system-ram mode | expand

Commit Message

Verma, Vishal L Oct. 2, 2019, 11:49 p.m. UTC
Sometimes, system configuration can result in new memory blocks getting
onlined automatically. Often, these auto-onlining mechanisms don't
provide a choice or configurability in the matter of which zone is used
for these new blocks, and they just end up in ZONE_NORMAL.

Usually, for hot-plugged memory, ZONE_NORMAL is undesirable because:
 - An application might want total control over this memory
 - ZONE_NORMAL precludes hot-removal of this memory
 - Having kernel data structures in this memory, especially performance
   sensitive ones, such as page tables, may be undesirable.

Thus report if a race condition is encountered while onlining memory,
and provide the user options to remedy it.

Clarify the default zone expectations, and the race detection behavior
in the daxctl-reconfigure-device man page, and move the relevant section
under the 'Description' header, instead of hidden away under the
'--no-online' option.

Cc: Ben Olson <ben.olson@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
---
 daxctl/device.c        |  9 ++++++
 daxctl/lib/libdaxctl.c | 62 +++++++++++++++++++++++++++++-------------
 2 files changed, 52 insertions(+), 19 deletions(-)

Comments

Dan Williams Oct. 18, 2019, 7:26 p.m. UTC | #1
On Wed, Oct 2, 2019 at 4:49 PM Vishal Verma <vishal.l.verma@intel.com> wrote:
>
> Sometimes, system configuration can result in new memory blocks getting
> onlined automatically. Often, these auto-onlining mechanisms don't
> provide a choice or configurability in the matter of which zone is used
> for these new blocks, and they just end up in ZONE_NORMAL.
>
> Usually, for hot-plugged memory, ZONE_NORMAL is undesirable because:
>  - An application might want total control over this memory
>  - ZONE_NORMAL precludes hot-removal of this memory
>  - Having kernel data structures in this memory, especially performance
>    sensitive ones, such as page tables, may be undesirable.
>
> Thus report if a race condition is encountered while onlining memory,
> and provide the user options to remedy it.
>
> Clarify the default zone expectations, and the race detection behavior
> in the daxctl-reconfigure-device man page, and move the relevant section
> under the 'Description' header, instead of hidden away under the
> '--no-online' option.
>
> Cc: Ben Olson <ben.olson@intel.com>
> Cc: Dave Hansen <dave.hansen@linux.intel.com>
> Cc: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Vishal Verma <vishal.l.verma@intel.com>
> ---
>  daxctl/device.c        |  9 ++++++
>  daxctl/lib/libdaxctl.c | 62 +++++++++++++++++++++++++++++-------------
>  2 files changed, 52 insertions(+), 19 deletions(-)
>
> diff --git a/daxctl/device.c b/daxctl/device.c
> index 920efc6..28698bf 100644
> --- a/daxctl/device.c
> +++ b/daxctl/device.c
> @@ -174,6 +174,15 @@ static int dev_online_memory(struct daxctl_dev *dev)
>                         devname, strerror(-num_on));
>                 return num_on;
>         }
> +       if (num_on)
> +               fprintf(stderr,
> +                   "%s:\n  WARNING: detected a race while onlining memory\n"
> +                   "  Some memory may not be in the expected zone. It is\n"
> +                   "  recommended to disable any other onlining mechanisms,\n"
> +                   "  and retry. If onlining is to be left to other agents,\n"
> +                   "  use the --no-online option to suppress this warning\n",
> +                   devname);
> +
>         if (num_on == num_sections) {
>                 fprintf(stderr, "%s: all memory sections (%d) already online\n",
>                         devname, num_on);
> diff --git a/daxctl/lib/libdaxctl.c b/daxctl/lib/libdaxctl.c
> index 617887c..5a7e37c 100644
> --- a/daxctl/lib/libdaxctl.c
> +++ b/daxctl/lib/libdaxctl.c
> @@ -1079,10 +1079,10 @@ static int memblock_is_online(struct daxctl_memory *mem, char *memblock)
>         return 0;
>  }
>
> -static int online_one_memblock(struct daxctl_memory *mem, char *memblock)
> +static int online_one_memblock(struct daxctl_memory *mem, char *memblock,
> +               int *status)
>  {
>         struct daxctl_dev *dev = daxctl_memory_get_dev(mem);
> -       const char *devname = daxctl_dev_get_devname(dev);
>         struct daxctl_ctx *ctx = daxctl_dev_get_ctx(dev);
>         const char *mode = "online_movable";
>         int len = mem->buf_len, rc;
> @@ -1097,10 +1097,6 @@ static int online_one_memblock(struct daxctl_memory *mem, char *memblock)
>         if (rc < 0)
>                 return -ENOMEM;
>
> -       /*
> -        * if already online, possibly due to kernel config or a udev rule,
> -        * there is nothing to do and we can skip over the memblock
> -        */
>         rc = memblock_is_online(mem, memblock);
>         if (rc)
>                 return rc;
> @@ -1108,18 +1104,14 @@ static int online_one_memblock(struct daxctl_memory *mem, char *memblock)
>         rc = sysfs_write_attr_quiet(ctx, path, mode);
>         if (rc) {
>                 /*
> -                * While we performed an already-online check above, there
> -                * is still a TOCTOU hole where someone (such as a udev rule)
> -                * may have raced to online the memory. In such a case,
> -                * the sysfs store will fail, however we can check for this
> -                * by simply reading the state again. If it changed to the
> -                * desired state, then we don't have to error out.
> +                * If the block got onlined, potentially by some other agent,
> +                * do nothing for now. There will be a full scan for zone
> +                * correctness later.
>                  */
> -               if (memblock_is_online(mem, memblock))
> -                       return 1;
> -               err(ctx, "%s: Failed to online %s: %s\n",
> -                       devname, path, strerror(-rc));
> +               if (memblock_is_online(mem, memblock) == 1)
> +                       return 0;
>         }
> +
>         return rc;
>  }
>
> @@ -1150,7 +1142,7 @@ static int offline_one_memblock(struct daxctl_memory *mem, char *memblock)
>
>         rc = sysfs_write_attr_quiet(ctx, path, mode);
>         if (rc) {
> -               /* Close the TOCTOU hole like in online_one_memblock() above */
> +               /* check if something raced us to offline (unlikely) */
>                 if (!memblock_is_online(mem, memblock))
>                         return 1;
>                 err(ctx, "%s: Failed to offline %s: %s\n",
> @@ -1274,7 +1266,7 @@ static int op_for_one_memblock(struct daxctl_memory *mem, char *memblock,
>
>         switch (op) {
>         case MEM_SET_ONLINE:
> -               return online_one_memblock(mem, memblock);
> +               return online_one_memblock(mem, memblock, status);
>         case MEM_SET_OFFLINE:
>                 return offline_one_memblock(mem, memblock);
>         case MEM_IS_ONLINE:
> @@ -1349,9 +1341,41 @@ out_dir:
>         return rc;
>  }
>
> +/*
> + * daxctl_memory_online() will online to ZONE_MOVABLE by default
> + */
>  DAXCTL_EXPORT int daxctl_memory_online(struct daxctl_memory *mem)
>  {
> -       return daxctl_memory_op(mem, MEM_SET_ONLINE);
> +       struct daxctl_dev *dev = daxctl_memory_get_dev(mem);
> +       const char *devname = daxctl_dev_get_devname(dev);
> +       struct daxctl_ctx *ctx = daxctl_dev_get_ctx(dev);
> +       int rc;
> +
> +       rc = daxctl_memory_op(mem, MEM_SET_ONLINE);
> +       if (rc)
> +               return rc;
> +
> +       /*
> +        * Detect any potential races when blocks were being brought online by
> +        * checking the zone in which the memory blocks are at this point. If
> +        * any of the blocks are not in ZONE_MOVABLE, emit a warning.
> +        */
> +       mem->zone = 0;
> +       rc = daxctl_memory_op(mem, MEM_FIND_ZONE);
> +       if (rc)
> +               return rc;
> +       if (mem->zone != MEM_ZONE_MOVABLE) {
> +               err(ctx,
> +                   "%s:\n  WARNING: detected a race while onlining memory\n"
> +                   "  Some memory may not be in the expected zone. It is\n"
> +                   "  recommended to disable any other onlining mechanisms,\n"
> +                   "  and retry. If onlining is to be left to other agents,\n"
> +                   "  use the --no-online option to suppress this warning\n",
> +                   devname);

Rather than duplicate this largish warning message what about a
smaller one that references the man page about remediation
possibilities. Then you can also name distro udev rule as one of the
usual suspects that causes this.
diff mbox series

Patch

diff --git a/daxctl/device.c b/daxctl/device.c
index 920efc6..28698bf 100644
--- a/daxctl/device.c
+++ b/daxctl/device.c
@@ -174,6 +174,15 @@  static int dev_online_memory(struct daxctl_dev *dev)
 			devname, strerror(-num_on));
 		return num_on;
 	}
+	if (num_on)
+		fprintf(stderr,
+		    "%s:\n  WARNING: detected a race while onlining memory\n"
+		    "  Some memory may not be in the expected zone. It is\n"
+		    "  recommended to disable any other onlining mechanisms,\n"
+		    "  and retry. If onlining is to be left to other agents,\n"
+		    "  use the --no-online option to suppress this warning\n",
+		    devname);
+
 	if (num_on == num_sections) {
 		fprintf(stderr, "%s: all memory sections (%d) already online\n",
 			devname, num_on);
diff --git a/daxctl/lib/libdaxctl.c b/daxctl/lib/libdaxctl.c
index 617887c..5a7e37c 100644
--- a/daxctl/lib/libdaxctl.c
+++ b/daxctl/lib/libdaxctl.c
@@ -1079,10 +1079,10 @@  static int memblock_is_online(struct daxctl_memory *mem, char *memblock)
 	return 0;
 }
 
-static int online_one_memblock(struct daxctl_memory *mem, char *memblock)
+static int online_one_memblock(struct daxctl_memory *mem, char *memblock,
+		int *status)
 {
 	struct daxctl_dev *dev = daxctl_memory_get_dev(mem);
-	const char *devname = daxctl_dev_get_devname(dev);
 	struct daxctl_ctx *ctx = daxctl_dev_get_ctx(dev);
 	const char *mode = "online_movable";
 	int len = mem->buf_len, rc;
@@ -1097,10 +1097,6 @@  static int online_one_memblock(struct daxctl_memory *mem, char *memblock)
 	if (rc < 0)
 		return -ENOMEM;
 
-	/*
-	 * if already online, possibly due to kernel config or a udev rule,
-	 * there is nothing to do and we can skip over the memblock
-	 */
 	rc = memblock_is_online(mem, memblock);
 	if (rc)
 		return rc;
@@ -1108,18 +1104,14 @@  static int online_one_memblock(struct daxctl_memory *mem, char *memblock)
 	rc = sysfs_write_attr_quiet(ctx, path, mode);
 	if (rc) {
 		/*
-		 * While we performed an already-online check above, there
-		 * is still a TOCTOU hole where someone (such as a udev rule)
-		 * may have raced to online the memory. In such a case,
-		 * the sysfs store will fail, however we can check for this
-		 * by simply reading the state again. If it changed to the
-		 * desired state, then we don't have to error out.
+		 * If the block got onlined, potentially by some other agent,
+		 * do nothing for now. There will be a full scan for zone
+		 * correctness later.
 		 */
-		if (memblock_is_online(mem, memblock))
-			return 1;
-		err(ctx, "%s: Failed to online %s: %s\n",
-			devname, path, strerror(-rc));
+		if (memblock_is_online(mem, memblock) == 1)
+			return 0;
 	}
+
 	return rc;
 }
 
@@ -1150,7 +1142,7 @@  static int offline_one_memblock(struct daxctl_memory *mem, char *memblock)
 
 	rc = sysfs_write_attr_quiet(ctx, path, mode);
 	if (rc) {
-		/* Close the TOCTOU hole like in online_one_memblock() above */
+		/* check if something raced us to offline (unlikely) */
 		if (!memblock_is_online(mem, memblock))
 			return 1;
 		err(ctx, "%s: Failed to offline %s: %s\n",
@@ -1274,7 +1266,7 @@  static int op_for_one_memblock(struct daxctl_memory *mem, char *memblock,
 
 	switch (op) {
 	case MEM_SET_ONLINE:
-		return online_one_memblock(mem, memblock);
+		return online_one_memblock(mem, memblock, status);
 	case MEM_SET_OFFLINE:
 		return offline_one_memblock(mem, memblock);
 	case MEM_IS_ONLINE:
@@ -1349,9 +1341,41 @@  out_dir:
 	return rc;
 }
 
+/*
+ * daxctl_memory_online() will online to ZONE_MOVABLE by default
+ */
 DAXCTL_EXPORT int daxctl_memory_online(struct daxctl_memory *mem)
 {
-	return daxctl_memory_op(mem, MEM_SET_ONLINE);
+	struct daxctl_dev *dev = daxctl_memory_get_dev(mem);
+	const char *devname = daxctl_dev_get_devname(dev);
+	struct daxctl_ctx *ctx = daxctl_dev_get_ctx(dev);
+	int rc;
+
+	rc = daxctl_memory_op(mem, MEM_SET_ONLINE);
+	if (rc)
+		return rc;
+
+	/*
+	 * Detect any potential races when blocks were being brought online by
+	 * checking the zone in which the memory blocks are at this point. If
+	 * any of the blocks are not in ZONE_MOVABLE, emit a warning.
+	 */
+	mem->zone = 0;
+	rc = daxctl_memory_op(mem, MEM_FIND_ZONE);
+	if (rc)
+		return rc;
+	if (mem->zone != MEM_ZONE_MOVABLE) {
+		err(ctx,
+		    "%s:\n  WARNING: detected a race while onlining memory\n"
+		    "  Some memory may not be in the expected zone. It is\n"
+		    "  recommended to disable any other onlining mechanisms,\n"
+		    "  and retry. If onlining is to be left to other agents,\n"
+		    "  use the --no-online option to suppress this warning\n",
+		    devname);
+		return -EBUSY;
+	}
+
+	return rc;
 }
 
 DAXCTL_EXPORT int daxctl_memory_offline(struct daxctl_memory *mem)