diff mbox series

[v4,13/14] memblock: Add KHO support for reserve_mem

Message ID 20250206132754.2596694-14-rppt@kernel.org (mailing list archive)
State New
Headers show
Series kexec: introduce Kexec HandOver (KHO) | expand

Commit Message

Mike Rapoport Feb. 6, 2025, 1:27 p.m. UTC
From: Alexander Graf <graf@amazon.com>

Linux has recently gained support for "reserve_mem": A mechanism to
allocate a region of memory early enough in boot that we can cross our
fingers and hope it stays at the same location during most boots, so we
can store for example ftrace buffers into it.

Thanks to KASLR, we can never be really sure that "reserve_mem"
allocations are static across kexec. Let's teach it KHO awareness so
that it serializes its reservations on kexec exit and deserializes them
again on boot, preserving the exact same mapping across kexec.

This is an example user for KHO in the KHO patch set to ensure we have
at least one (not very controversial) user in the tree before extending
KHO's use to more subsystems.

Signed-off-by: Alexander Graf <graf@amazon.com>
Co-developed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
 mm/memblock.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 131 insertions(+)

Comments

Rob Herring Feb. 10, 2025, 4:03 p.m. UTC | #1
On Thu, Feb 6, 2025 at 7:30 AM Mike Rapoport <rppt@kernel.org> wrote:
>
> From: Alexander Graf <graf@amazon.com>
>
> Linux has recently gained support for "reserve_mem": A mechanism to
> allocate a region of memory early enough in boot that we can cross our
> fingers and hope it stays at the same location during most boots, so we
> can store for example ftrace buffers into it.
>
> Thanks to KASLR, we can never be really sure that "reserve_mem"
> allocations are static across kexec. Let's teach it KHO awareness so
> that it serializes its reservations on kexec exit and deserializes them
> again on boot, preserving the exact same mapping across kexec.
>
> This is an example user for KHO in the KHO patch set to ensure we have
> at least one (not very controversial) user in the tree before extending
> KHO's use to more subsystems.
>
> Signed-off-by: Alexander Graf <graf@amazon.com>
> Co-developed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> ---
>  mm/memblock.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 131 insertions(+)
>
> diff --git a/mm/memblock.c b/mm/memblock.c
> index 84df96efca62..fdb08b60efc1 100644
> --- a/mm/memblock.c
> +++ b/mm/memblock.c
> @@ -16,6 +16,9 @@
>  #include <linux/kmemleak.h>
>  #include <linux/seq_file.h>
>  #include <linux/memblock.h>
> +#include <linux/kexec_handover.h>
> +#include <linux/kexec.h>
> +#include <linux/libfdt.h>
>
>  #include <asm/sections.h>
>  #include <linux/io.h>
> @@ -2423,6 +2426,70 @@ int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *
>  }
>  EXPORT_SYMBOL_GPL(reserve_mem_find_by_name);
>
> +static bool __init reserve_mem_kho_revive(const char *name, phys_addr_t size,
> +                                         phys_addr_t align)
> +{
> +       const void *fdt = kho_get_fdt();
> +       const char *path = "/reserve_mem";
> +       int node, child, err;
> +
> +       if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER))
> +               return false;
> +
> +       if (!fdt)
> +               return false;
> +
> +       node = fdt_path_offset(fdt, "/reserve_mem");
> +       if (node < 0)
> +               return false;
> +
> +       err = fdt_node_check_compatible(fdt, node, "reserve_mem-v1");
> +       if (err) {
> +               pr_warn("Node '%s' has unknown compatible", path);
> +               return false;
> +       }
> +
> +       fdt_for_each_subnode(child, fdt, node) {
> +               const struct kho_mem *mem;
> +               const char *child_name;
> +               int len;
> +
> +               /* Search for old kernel's reserved_mem with the same name */
> +               child_name = fdt_get_name(fdt, child, NULL);
> +               if (strcmp(name, child_name))
> +                       continue;
> +
> +               err = fdt_node_check_compatible(fdt, child, "reserve_mem_map-v1");

It really seems you all are trying to have things both ways. It's not
Devicetree, just the FDT file format, but then here you use
"compatible" which *is* Devicetree. At best, it's all just confusing
for folks. At worst, you're just picking and choosing what you want to
use.

I'm not saying don't use "compatible" just for the sake of looking
less like DT, but perhaps your versioning should be done differently.
You are reading the 'mem' property straight into a struct. Maybe the
struct should have a version. Or the size of the struct is the version
much like the userspace ABI is handled for structs.

> +               if (err) {
> +                       pr_warn("Node '%s/%s' has unknown compatible", path, name);
> +                       continue;
> +               }
> +
> +               mem = fdt_getprop(fdt, child, "mem", &len);
> +               if (!mem || len != sizeof(*mem))
> +                       continue;
> +
> +               if (mem->addr & (align - 1)) {

It's stated somewhere in this that the FDT data is LE, but here you
are assuming the FDT is the same endianness as the CPU not that it's
LE. Arm64 can do BE. PowerPC does both. I'm not sure if kexec from one
endianness to another is possible. I would guess in theory it is and
in practice it's broken already (because kexec is always an
afterthought). Either you need to guarantee that native endianness
will never be an issue for any arch or you need to make the endianness
fixed.

Rob
Mike Rapoport Feb. 12, 2025, 4:30 p.m. UTC | #2
On Mon, Feb 10, 2025 at 10:03:58AM -0600, Rob Herring wrote:
> On Thu, Feb 6, 2025 at 7:30 AM Mike Rapoport <rppt@kernel.org> wrote:
> >
> > From: Alexander Graf <graf@amazon.com>
> >
> > Linux has recently gained support for "reserve_mem": A mechanism to
> > allocate a region of memory early enough in boot that we can cross our
> > fingers and hope it stays at the same location during most boots, so we
> > can store for example ftrace buffers into it.
> >
> > Thanks to KASLR, we can never be really sure that "reserve_mem"
> > allocations are static across kexec. Let's teach it KHO awareness so
> > that it serializes its reservations on kexec exit and deserializes them
> > again on boot, preserving the exact same mapping across kexec.
> >
> > This is an example user for KHO in the KHO patch set to ensure we have
> > at least one (not very controversial) user in the tree before extending
> > KHO's use to more subsystems.
> >
> > Signed-off-by: Alexander Graf <graf@amazon.com>
> > Co-developed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> > Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> > ---
> >  mm/memblock.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 131 insertions(+)
> >
> > diff --git a/mm/memblock.c b/mm/memblock.c
> > index 84df96efca62..fdb08b60efc1 100644
> > --- a/mm/memblock.c
> > +++ b/mm/memblock.c
> > @@ -16,6 +16,9 @@
> >  #include <linux/kmemleak.h>
> >  #include <linux/seq_file.h>
> >  #include <linux/memblock.h>
> > +#include <linux/kexec_handover.h>
> > +#include <linux/kexec.h>
> > +#include <linux/libfdt.h>
> >
> >  #include <asm/sections.h>
> >  #include <linux/io.h>
> > @@ -2423,6 +2426,70 @@ int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *
> >  }
> >  EXPORT_SYMBOL_GPL(reserve_mem_find_by_name);
> >
> > +static bool __init reserve_mem_kho_revive(const char *name, phys_addr_t size,
> > +                                         phys_addr_t align)
> > +{
> > +       const void *fdt = kho_get_fdt();
> > +       const char *path = "/reserve_mem";
> > +       int node, child, err;
> > +
> > +       if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER))
> > +               return false;
> > +
> > +       if (!fdt)
> > +               return false;
> > +
> > +       node = fdt_path_offset(fdt, "/reserve_mem");
> > +       if (node < 0)
> > +               return false;
> > +
> > +       err = fdt_node_check_compatible(fdt, node, "reserve_mem-v1");
> > +       if (err) {
> > +               pr_warn("Node '%s' has unknown compatible", path);
> > +               return false;
> > +       }
> > +
> > +       fdt_for_each_subnode(child, fdt, node) {
> > +               const struct kho_mem *mem;
> > +               const char *child_name;
> > +               int len;
> > +
> > +               /* Search for old kernel's reserved_mem with the same name */
> > +               child_name = fdt_get_name(fdt, child, NULL);
> > +               if (strcmp(name, child_name))
> > +                       continue;
> > +
> > +               err = fdt_node_check_compatible(fdt, child, "reserve_mem_map-v1");
> 
> It really seems you all are trying to have things both ways. It's not
> Devicetree, just the FDT file format, but then here you use
> "compatible" which *is* Devicetree. At best, it's all just confusing
> for folks. At worst, you're just picking and choosing what you want to
> use.
> 
> I'm not saying don't use "compatible" just for the sake of looking
> less like DT, but perhaps your versioning should be done differently.
> You are reading the 'mem' property straight into a struct. Maybe the
> struct should have a version. Or the size of the struct is the version
> much like the userspace ABI is handled for structs.

The idea is to have a high-level compatibility notion at the node level and
up, rather than verifying it for each and every struct like uABI does.
For that, "compatible" seems like a perfect fit.
 
> > +               if (err) {
> > +                       pr_warn("Node '%s/%s' has unknown compatible", path, name);
> > +                       continue;
> > +               }
> > +
> > +               mem = fdt_getprop(fdt, child, "mem", &len);
> > +               if (!mem || len != sizeof(*mem))
> > +                       continue;
> > +
> > +               if (mem->addr & (align - 1)) {
> 
> It's stated somewhere in this that the FDT data is LE, but here you
> are assuming the FDT is the same endianness as the CPU not that it's
> LE. Arm64 can do BE. PowerPC does both. I'm not sure if kexec from one
> endianness to another is possible. I would guess in theory it is and
> in practice it's broken already (because kexec is always an
> afterthought). Either you need to guarantee that native endianness
> will never be an issue for any arch or you need to make the endianness
> fixed.

I believe Alex mentioned little endian in the sense of native endianness
for practical purposes :)

arm64 does seem to support kexec from one endianness to another in
certain circumstances, but I believe that we can limit KHO to work only
when both kernels have the same endianness.
 
> Rob
Wei Yang Feb. 17, 2025, 4:04 a.m. UTC | #3
On Thu, Feb 06, 2025 at 03:27:53PM +0200, Mike Rapoport wrote:
>From: Alexander Graf <graf@amazon.com>
>
>Linux has recently gained support for "reserve_mem": A mechanism to
>allocate a region of memory early enough in boot that we can cross our
>fingers and hope it stays at the same location during most boots, so we
>can store for example ftrace buffers into it.
>
>Thanks to KASLR, we can never be really sure that "reserve_mem"
>allocations are static across kexec. Let's teach it KHO awareness so
>that it serializes its reservations on kexec exit and deserializes them
>again on boot, preserving the exact same mapping across kexec.
>
>This is an example user for KHO in the KHO patch set to ensure we have
>at least one (not very controversial) user in the tree before extending
>KHO's use to more subsystems.
>
>Signed-off-by: Alexander Graf <graf@amazon.com>
>Co-developed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
>Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
>---
> mm/memblock.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 131 insertions(+)
>
>diff --git a/mm/memblock.c b/mm/memblock.c
>index 84df96efca62..fdb08b60efc1 100644
>--- a/mm/memblock.c
>+++ b/mm/memblock.c
>@@ -16,6 +16,9 @@
> #include <linux/kmemleak.h>
> #include <linux/seq_file.h>
> #include <linux/memblock.h>
>+#include <linux/kexec_handover.h>

Looks like this one breaks the memblock test in tools/testing/memblock.

memblock.c:19:10: fatal error: linux/kexec_handover.h: No such file or directory
   19 | #include <linux/kexec_handover.h>
      |          ^~~~~~~~~~~~~~~~~~~~~~~~

>+#include <linux/kexec.h>
>+#include <linux/libfdt.h>
>
Mike Rapoport Feb. 19, 2025, 7:25 a.m. UTC | #4
On Mon, Feb 17, 2025 at 04:04:48AM +0000, Wei Yang wrote:
> On Thu, Feb 06, 2025 at 03:27:53PM +0200, Mike Rapoport wrote:
> >From: Alexander Graf <graf@amazon.com>
> >
> >Linux has recently gained support for "reserve_mem": A mechanism to
> >allocate a region of memory early enough in boot that we can cross our
> >fingers and hope it stays at the same location during most boots, so we
> >can store for example ftrace buffers into it.
> >
> >Thanks to KASLR, we can never be really sure that "reserve_mem"
> >allocations are static across kexec. Let's teach it KHO awareness so
> >that it serializes its reservations on kexec exit and deserializes them
> >again on boot, preserving the exact same mapping across kexec.
> >
> >This is an example user for KHO in the KHO patch set to ensure we have
> >at least one (not very controversial) user in the tree before extending
> >KHO's use to more subsystems.
> >
> >Signed-off-by: Alexander Graf <graf@amazon.com>
> >Co-developed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> >Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> >---
> > mm/memblock.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 131 insertions(+)
> >
> >diff --git a/mm/memblock.c b/mm/memblock.c
> >index 84df96efca62..fdb08b60efc1 100644
> >--- a/mm/memblock.c
> >+++ b/mm/memblock.c
> >@@ -16,6 +16,9 @@
> > #include <linux/kmemleak.h>
> > #include <linux/seq_file.h>
> > #include <linux/memblock.h>
> >+#include <linux/kexec_handover.h>
> 
> Looks like this one breaks the memblock test in tools/testing/memblock.
> 
> memblock.c:19:10: fatal error: linux/kexec_handover.h: No such file or directory
>    19 | #include <linux/kexec_handover.h>
>       |          ^~~~~~~~~~~~~~~~~~~~~~~~

Thanks, will fix.
 
> >+#include <linux/kexec.h>
> >+#include <linux/libfdt.h>
> > 
> 
> -- 
> Wei Yang
> Help you, Help me
diff mbox series

Patch

diff --git a/mm/memblock.c b/mm/memblock.c
index 84df96efca62..fdb08b60efc1 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -16,6 +16,9 @@ 
 #include <linux/kmemleak.h>
 #include <linux/seq_file.h>
 #include <linux/memblock.h>
+#include <linux/kexec_handover.h>
+#include <linux/kexec.h>
+#include <linux/libfdt.h>
 
 #include <asm/sections.h>
 #include <linux/io.h>
@@ -2423,6 +2426,70 @@  int reserve_mem_find_by_name(const char *name, phys_addr_t *start, phys_addr_t *
 }
 EXPORT_SYMBOL_GPL(reserve_mem_find_by_name);
 
+/*
+ * Try to restore the reserve_mem region @name from the KHO FDT left by the
+ * previous kernel. Succeeds only if a child of "/reserve_mem" is called @name
+ * and carries a "mem" record whose address honours @align and whose size is
+ * @size. The record is read in native byte order. Returns true if revived.
+ */
+static bool __init reserve_mem_kho_revive(const char *name, phys_addr_t size,
+					  phys_addr_t align)
+{
+	const void *fdt = kho_get_fdt();
+	const char *path = "/reserve_mem";
+	int node, child;
+
+	if (!IS_ENABLED(CONFIG_KEXEC_HANDOVER) || !fdt)
+		return false;
+
+	node = fdt_path_offset(fdt, path);
+	if (node < 0)
+		return false;
+
+	if (fdt_node_check_compatible(fdt, node, "reserve_mem-v1")) {
+		pr_warn("Node '%s' has unknown compatible\n", path);
+		return false;
+	}
+
+	fdt_for_each_subnode(child, fdt, node) {
+		const struct kho_mem *mem;
+		const char *child_name;
+		int len;
+
+		/* Find the previous kernel's reservation; skip unnamed nodes */
+		child_name = fdt_get_name(fdt, child, NULL);
+		if (!child_name || strcmp(name, child_name))
+			continue;
+
+		if (fdt_node_check_compatible(fdt, child, "reserve_mem_map-v1")) {
+			pr_warn("Node '%s/%s' has unknown compatible\n", path, name);
+			continue;
+		}
+
+		mem = fdt_getprop(fdt, child, "mem", &len);
+		if (!mem || len != sizeof(*mem))
+			continue;
+
+		if (mem->addr & (align - 1)) {
+			pr_warn("KHO reserved_mem '%s' has wrong alignment (0x%llx, 0x%llx)\n",
+				name, (unsigned long long)align, (unsigned long long)mem->addr);
+			continue;
+		}
+
+		if (mem->size != size) {
+			pr_warn("KHO reserved_mem '%s' has wrong size (0x%llx != 0x%llx)\n",
+				name, (unsigned long long)mem->size, (unsigned long long)size);
+			continue;
+		}
+
+		reserved_mem_add(mem->addr, mem->size, name);
+		pr_info("Revived memory reservation '%s' from KHO\n", name);
+		return true;
+	}
+
+	return false;
+}
+
 /*
  * Parse reserve_mem=nn:align:name
  */
@@ -2478,6 +2545,11 @@  static int __init reserve_mem(char *p)
 	if (reserve_mem_find_by_name(name, &start, &tmp))
 		return -EBUSY;
 
+	/* Pick previous allocations up from KHO if available */
+	if (reserve_mem_kho_revive(name, size, align))
+		return 1;
+
+	/* TODO: Allocation must be outside of scratch region */
 	start = memblock_phys_alloc(size, align);
 	if (!start)
 		return -ENOMEM;
@@ -2488,6 +2560,65 @@  static int __init reserve_mem(char *p)
 }
 __setup("reserve_mem=", reserve_mem);
 
+/*
+ * Emit one FDT subnode, named after @map, holding its {addr, size} record.
+ * Results of the FDT calls are OR-ed: the return is non-zero if any was.
+ */
+static int reserve_mem_kho_write_map(void *fdt, struct reserve_mem_table *map)
+{
+	const char compat[] = "reserve_mem_map-v1";
+	struct kho_mem rec = { .addr = map->start, .size = map->size };
+	int ret = fdt_begin_node(fdt, map->name);
+
+	ret |= fdt_property(fdt, "compatible", compat, sizeof(compat));
+	ret |= fdt_property(fdt, "mem", &rec, sizeof(rec));
+	ret |= fdt_end_node(fdt);
+
+	return ret;
+}
+
+/*
+ * KHO notifier callback. On KEXEC_KHO_DUMP, writes a "reserve_mem" node with
+ * one subnode per reserved_mem_table entry into the FDT passed in @v; on
+ * KEXEC_KHO_ABORT nothing is done. Any other @cmd yields NOTIFY_BAD.
+ */
+static int reserve_mem_kho_notifier(struct notifier_block *self,
+				    unsigned long cmd, void *v)
+{
+	const char compat[] = "reserve_mem-v1";
+	void *fdt = v;
+	int idx, ret;
+
+	if (cmd == KEXEC_KHO_ABORT)
+		return NOTIFY_DONE;
+	if (cmd != KEXEC_KHO_DUMP)
+		return NOTIFY_BAD;
+
+	/* Empty table: there is nothing to serialize */
+	if (reserved_mem_count == 0)
+		return NOTIFY_DONE;
+
+	ret = fdt_begin_node(fdt, "reserve_mem");
+	ret |= fdt_property(fdt, "compatible", compat, sizeof(compat));
+	for (idx = 0; idx < reserved_mem_count; idx++)
+		ret |= reserve_mem_kho_write_map(fdt, &reserved_mem_table[idx]);
+	ret |= fdt_end_node(fdt);
+
+	return ret ? NOTIFY_BAD : NOTIFY_DONE;
+}
+
+/* Notifier block for KHO; its callback is reserve_mem_kho_notifier(). */
+static struct notifier_block reserve_mem_kho_nb = {
+	.notifier_call = reserve_mem_kho_notifier,
+};
+
+static int __init reserve_mem_init(void)
+{
+	register_kho_notifier(&reserve_mem_kho_nb);
+	return 0;
+}
+core_initcall(reserve_mem_init);
+
 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK)
 static const char * const flagname[] = {
 	[ilog2(MEMBLOCK_HOTPLUG)] = "HOTPLUG",