
[v5,09/16] kexec: enable KHO support for memory preservation

Message ID 20250320015551.2157511-10-changyuanl@google.com (mailing list archive)
State New
Series kexec: introduce Kexec HandOver (KHO)

Commit Message

Changyuan Lyu March 20, 2025, 1:55 a.m. UTC
From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>

Introduce APIs allowing KHO users to preserve memory across kexec and
get access to that memory after boot of the kexeced kernel

kho_preserve_folio() - record a folio to be preserved over kexec
kho_restore_folio() - recreates the folio from the preserved memory
kho_preserve_phys() - record physically contiguous range to be
preserved over kexec.
kho_restore_phys() - recreates order-0 pages corresponding to the
preserved physical range

The memory preservations are tracked by two levels of xarrays to manage
chunks of per-order 512 byte bitmaps. For instance the entire 1G order
of a 1TB x86 system would fit inside a single 512 byte bitmap. For
order 0 allocations each bitmap will cover 16M of address space. Thus,
for 16G of memory at most 512K of bitmap memory will be needed for order 0.
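To make the sizing concrete (assuming 4 KiB pages): a 512 byte bitmap holds
512 * 8 = 4096 bits. At order 0 each bit covers one 4 KiB page, so a bitmap
spans 4096 * 4 KiB = 16 MiB, and covering 16 GiB takes at most
16 GiB / 16 MiB = 1024 bitmaps, i.e. 1024 * 512 B = 512 KiB. At the 1 GiB
order each bit covers 1 GiB, so a single bitmap spans 4 TiB and a 1 TiB
machine fits in one bitmap.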

At serialization time all bitmaps are recorded in a linked list of pages
for the next kernel to process and the physical address of the list is
recorded in KHO FDT.

The next kernel then processes that list, reserves the memory ranges and
later, when a user requests a folio or a physical range, KHO restores
corresponding memory map entries.

Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
Co-developed-by: Changyuan Lyu <changyuanl@google.com>
Signed-off-by: Changyuan Lyu <changyuanl@google.com>
---
 include/linux/kexec_handover.h |  38 +++
 kernel/kexec_handover.c        | 486 ++++++++++++++++++++++++++++++++-
 2 files changed, 522 insertions(+), 2 deletions(-)

Comments

Jason Gunthorpe March 21, 2025, 1:46 p.m. UTC | #1
On Wed, Mar 19, 2025 at 06:55:44PM -0700, Changyuan Lyu wrote:
> +/**
> + * kho_preserve_folio - preserve a folio across KHO.
> + * @folio: folio to preserve
> + *
> + * Records that the entire folio is preserved across KHO. The order
> + * will be preserved as well.
> + *
> + * Return: 0 on success, error code on failure
> + */
> +int kho_preserve_folio(struct folio *folio)
> +{
> +	unsigned long pfn = folio_pfn(folio);
> +	unsigned int order = folio_order(folio);
> +	int err;
> +
> +	if (!kho_enable)
> +		return -EOPNOTSUPP;
> +
> +	down_read(&kho_out.tree_lock);
> +	if (kho_out.fdt) {

What is the lock and fdt test for?

I'm getting the feeling that probably kho_preserve_folio() and the
like should accept some kind of 
'struct kho_serialization *' and then we don't need this to prove we
are within a valid serialization window. It could pass the pointer
through the notifiers

The global variables in this series are sort of ugly..

We want this to be fast, so try hard to avoid a lock..

> +void *kho_restore_phys(phys_addr_t phys, size_t size)
> +{
> +	unsigned long start_pfn, end_pfn, pfn;
> +	void *va = __va(phys);
> +
> +	start_pfn = PFN_DOWN(phys);
> +	end_pfn = PFN_UP(phys + size);
> +
> +	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
> +		struct page *page = pfn_to_online_page(pfn);
> +
> +		if (!page)
> +			return NULL;
> +		kho_restore_page(page);
> +	}
> +
> +	return va;
> +}
> +EXPORT_SYMBOL_GPL(kho_restore_phys);

What do you imagine this is used for? I'm not sure what value there is
in returning a void *? How does the caller "free" this?


> +#define KHOSER_PTR(type)          \
> +	union {                   \
> +		phys_addr_t phys; \
> +		type ptr;         \
> +	}
> +#define KHOSER_STORE_PTR(dest, val)                 \
> +	({                                          \
> +		(dest).phys = virt_to_phys(val);    \
> +		typecheck(typeof((dest).ptr), val); \
> +	})
> +#define KHOSER_LOAD_PTR(src) \
> +	((src).phys ? (typeof((src).ptr))(phys_to_virt((src).phys)) : NULL)

I had imagined these macros would be in a header and usable by drivers
that also want to use structs to carry information.
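For illustration, driver-side use of the macros from a shared header could
look roughly like this (sketch only; the struct and function names below are
made up and not part of this series):

struct my_drv_state {
	u32 nr_bufs;
	KHOSER_PTR(struct my_drv_buf *) bufs;	/* serialized as a phys_addr_t */
};

/* outgoing kernel: record the pointer by physical address */
static void my_drv_serialize(struct my_drv_state *state, struct my_drv_buf *bufs)
{
	KHOSER_STORE_PTR(state->bufs, bufs);
}

/* next kernel: convert the physical address back to a usable pointer */
static struct my_drv_buf *my_drv_restore(struct my_drv_state *state)
{
	return KHOSER_LOAD_PTR(state->bufs);
}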

> +static void deserialize_bitmap(unsigned int order,
> +			       struct khoser_mem_bitmap_ptr *elm)
> +{
> +	struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
> +	unsigned long bit;
> +
> +	for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) {
> +		int sz = 1 << (order + PAGE_SHIFT);
> +		phys_addr_t phys =
> +			elm->phys_start + (bit << (order + PAGE_SHIFT));
> +		struct page *page = phys_to_page(phys);
> +
> +		memblock_reserve(phys, sz);
> +		memblock_reserved_mark_noinit(phys, sz);

Mike asked about this earlier, is it worth combining runs of set bits
to increase sz? Or is this sort of temporary pending something better
that doesn't rely on memblock_reserve?

> +		page->private = order;

Can't just set the page order directly? Why use private?

> @@ -829,6 +1305,10 @@ static __init int kho_init(void)
>  
>  	kho_out.root.name = "";

?

>  	err = kho_add_string_prop(&kho_out.root, "compatible", "kho-v1");
> +	err |= kho_add_prop(&kho_out.preserved_memory, "metadata",
> +			    &kho_out.first_chunk_phys, sizeof(phys_addr_t));

metadata doesn't feel like a great name..

Please also document all the FDT schema thoroughly!

There should be yaml files just like in the normal DT case defining
all of this. This level of documentation and stability was one of the
selling reasons why FDT is being used here!

Jason
Mike Rapoport March 22, 2025, 7:12 p.m. UTC | #2
On Fri, Mar 21, 2025 at 10:46:29AM -0300, Jason Gunthorpe wrote:
> On Wed, Mar 19, 2025 at 06:55:44PM -0700, Changyuan Lyu wrote:
> >
> > +static void deserialize_bitmap(unsigned int order,
> > +			       struct khoser_mem_bitmap_ptr *elm)
> > +{
> > +	struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
> > +	unsigned long bit;
> > +
> > +	for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) {
> > +		int sz = 1 << (order + PAGE_SHIFT);
> > +		phys_addr_t phys =
> > +			elm->phys_start + (bit << (order + PAGE_SHIFT));
> > +		struct page *page = phys_to_page(phys);
> > +
> > +		memblock_reserve(phys, sz);
> > +		memblock_reserved_mark_noinit(phys, sz);
> 
> Mike asked about this earlier, is it worth combining runs of set bits
> to increase sz? Or is this sort of temporary pending something better
> that doesn't rely on memblock_reserve?

This hunk actually came from me. I decided to keep it simple for now and
check what the alternatives are, like moving away from memblock_reserve(),
adding a maple_tree or even something else.
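
(For reference, coalescing adjacent set bits into a single reservation could
look roughly like the sketch below. It reuses the names from
deserialize_bitmap() above and is not part of this series.)

	unsigned long start = find_first_bit(bitmap->preserve, PRESERVE_BITS);

	while (start < PRESERVE_BITS) {
		unsigned long end = find_next_zero_bit(bitmap->preserve,
						       PRESERVE_BITS, start);
		phys_addr_t phys = elm->phys_start +
				   ((phys_addr_t)start << (order + PAGE_SHIFT));
		phys_addr_t sz = (phys_addr_t)(end - start) << (order + PAGE_SHIFT);

		/* one reservation for the whole run [start, end) of set bits */
		memblock_reserve(phys, sz);
		memblock_reserved_mark_noinit(phys, sz);

		/* page->private would still be set per preserved entry, as today */
		start = find_next_bit(bitmap->preserve, PRESERVE_BITS, end);
	}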

> > +		page->private = order;
> 
> Can't just set the page order directly? Why use private?

Setting the order means recreating the folio the way prep_compound_page()
does. I think it's better to postpone it until the folio is requested. This
way it might run after SMP is enabled. Besides, when we start allocating
folios separately from struct page, initializing it here would be a real
issue.
 
> Jason
Jason Gunthorpe March 23, 2025, 6:55 p.m. UTC | #3
On Sat, Mar 22, 2025 at 03:12:26PM -0400, Mike Rapoport wrote:
> This hunk actually came from me. I decided to keep it simple for now and
> check what the alternatives are, like moving away from memblock_reserve(),
> adding a maple_tree or even something else.

Okay, makes sense to me
 
> > > +		page->private = order;
> > 
> > Can't just set the page order directly? Why use private?
> 
> Setting the order means recreating the folio the way prep_compound_page()
> does. I think it's better to postpone it until the folio is requested. This
> way it might run after SMP is enabled. 

I see, that makes sense, but also it could still use page->order..

> Besides, when we start allocating
> folios separately from struct page, initializing it here would be a real
> issue.

Yes, but also we wouldn't have page->private to make it work.. Somehow
anything we want to carry over would have to become encoded in the
memdesc directly.

I think this supports my remark someplace else that any user of this
that wants to preserve per-page data should do it on its own somehow
as an add-on-top?

Jason
Changyuan Lyu March 23, 2025, 7:07 p.m. UTC | #4
On Fri, Mar 21, 2025 at 10:46:29 -0300, Jason Gunthorpe <jgg@nvidia.com> wrote:
> On Wed, Mar 19, 2025 at 06:55:44PM -0700, Changyuan Lyu wrote:
> > +/**
> > + * kho_preserve_folio - preserve a folio across KHO.
> > + * @folio: folio to preserve
> > + *
> > + * Records that the entire folio is preserved across KHO. The order
> > + * will be preserved as well.
> > + *
> > + * Return: 0 on success, error code on failure
> > + */
> > +int kho_preserve_folio(struct folio *folio)
> > +{
> > +	unsigned long pfn = folio_pfn(folio);
> > +	unsigned int order = folio_order(folio);
> > +	int err;
> > +
> > +	if (!kho_enable)
> > +		return -EOPNOTSUPP;
> > +
> > +	down_read(&kho_out.tree_lock);
> > +	if (kho_out.fdt) {
>
> What is the lock and fdt test for?

It is to avoid a race between the following 2 operations:
- converting the hashtables and mem tracker to FDT,
- adding new data to hashtable/mem tracker.
Please also see function kho_finalize() in the previous patch
"kexec: add Kexec HandOver (KHO) generation helpers" [1].

The function kho_finalize() iterates over all the hashtables and
the mem tracker. We want to make sure that during the iterations,
no new data is added to the hashtables and mem tracker.

Also, once the FDT is generated, the mem tracker has already been serialized
into the linked pages, so we return -EBUSY to prevent more data from
being added to the mem tracker.

> I'm getting the feeling that probably kho_preserve_folio() and the
> like should accept some kind of
> 'struct kho_serialization *' and then we don't need this to prove we
> are within a valid serialization window. It could pass the pointer
> through the notifiers

If we use notifiers, callbacks have to be done serially.

> The global variables in this series are sort of ugly..
>
> We want this to be fast, so try hard to avoid a lock..

In most cases we only need the read lock. Different KHO users can add
data into their own subnodes in parallel.
We only need a write lock if
- 2 KHO users register subnodes to the KHO root node at the same time
- KHO root tree is about to be converted to FDT.

> > +void *kho_restore_phys(phys_addr_t phys, size_t size)
> > +{
> > +	unsigned long start_pfn, end_pfn, pfn;
> > +	void *va = __va(phys);
> > +
> > +	start_pfn = PFN_DOWN(phys);
> > +	end_pfn = PFN_UP(phys + size);
> > +
> > +	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
> > +		struct page *page = pfn_to_online_page(pfn);
> > +
> > +		if (!page)
> > +			return NULL;
> > +		kho_restore_page(page);
> > +	}
> > +
> > +	return va;
> > +}
> > +EXPORT_SYMBOL_GPL(kho_restore_phys);
>
> What do you imagine this is used for? I'm not sure what value there is
> in returning a void *? How does the caller "free" this?

This function is also from Mike :)

I suppose some KHO users may still
preserve memory using memory ranges (instead of folio). In the restoring
stage they need a helper to set up the pages of reserved memory ranges.
A void * is returned so the KHO user can access the memory
contents through the virtual address.
I guess the caller can free the ranges by free_pages()?

It makes sense to return nothing and let the caller call `__va`
if they want. Then the function signature looks more symmetric with
`kho_preserve_phys`.

> > +#define KHOSER_PTR(type)          \
> > +	union {                   \
> > +		phys_addr_t phys; \
> > +		type ptr;         \
> > +	}
> > +#define KHOSER_STORE_PTR(dest, val)                 \
> > +	({                                          \
> > +		(dest).phys = virt_to_phys(val);    \
> > +		typecheck(typeof((dest).ptr), val); \
> > +	})
> > +#define KHOSER_LOAD_PTR(src) \
> > +	((src).phys ? (typeof((src).ptr))(phys_to_virt((src).phys)) : NULL)
>
> I had imagined these macros would be in a header and usable by drivers
> that also want to use structs to carry information.
>

OK I will move them to the header file in the next version.

> > [...]
> > @@ -829,6 +1305,10 @@ static __init int kho_init(void)
> >
> >  	kho_out.root.name = "";
>
> ?

Set the root node name to an empty string since fdt_begin_node
calls strlen on the node name.

It is equivalent to `err = fdt_begin_node(fdt, "")` in kho_serialize()
of Mike's V4 patch [2].

> >  	err = kho_add_string_prop(&kho_out.root, "compatible", "kho-v1");
> > +	err |= kho_add_prop(&kho_out.preserved_memory, "metadata",
> > +			    &kho_out.first_chunk_phys, sizeof(phys_addr_t));
>
> metadata doesn't feel like a great name..
>
> Please also document all the FDT schema thoroughly!
>
> There should be yaml files just like in the normal DT case defining
> all of this. This level of documentation and stability was one of the
> selling reasons why FDT is being used here!

YAML files were dropped because we think it may take a while for our
schema to be near stable. So we start from some simple plain text. We
can add some prop and node docs (that are considered stable at this point)
back to YAML in the next version.

[1] https://lore.kernel.org/all/20250320015551.2157511-8-changyuanl@google.com/
[2] https://lore.kernel.org/all/20250206132754.2596694-6-rppt@kernel.org/

Best,
Changyuan
Mike Rapoport March 24, 2025, 6:18 p.m. UTC | #5
On Sun, Mar 23, 2025 at 03:55:52PM -0300, Jason Gunthorpe wrote:
> On Sat, Mar 22, 2025 at 03:12:26PM -0400, Mike Rapoport wrote:
>  
> > > > +		page->private = order;
> > > 
> > > Can't just set the page order directly? Why use private?
> > 
> > Setting the order means recreating the folio the way prep_compound_page()
> > does. I think it's better to postpone it until the folio is requested. This
> > way it might run after SMP is enabled. 
> 
> I see, that makes sense, but also it could still use page->order..

But there's no page->order :)
 
> > Besides, when we start allocating
> > folios separately from struct page, initializing it here would be a real
> > issue.
> 
> Yes, but also we wouldn't have page->private to make it work.. Somehow
> anything we want to carry over would have to become encoded in the
> memdesc directly.

This is a problem to solve in 2026 :)

The January update for State of Page [1] talks about a

	reasonable goal to shrink struct page to (approximately): 

	struct page {
	    unsigned long flags;
	    union {
	        struct list_head buddy_list;
	        struct list_head pcp_list;
	        struct {
	            unsigned long memdesc;
	            int _refcount;
	        };
	    };
	    union {
	        unsigned long private;
	        struct {
	            int _folio_mapcount;
	        };
	    };
	};
 
[1] https://lore.kernel.org/linux-mm/Z37pxbkHPbLYnDKn@casper.infradead.org/
 
> Jason
Jason Gunthorpe March 24, 2025, 8:07 p.m. UTC | #6
On Mon, Mar 24, 2025 at 02:18:34PM -0400, Mike Rapoport wrote:
> On Sun, Mar 23, 2025 at 03:55:52PM -0300, Jason Gunthorpe wrote:
> > On Sat, Mar 22, 2025 at 03:12:26PM -0400, Mike Rapoport wrote:
> >  
> > > > > +		page->private = order;
> > > > 
> > > > Can't just set the page order directly? Why use private?
> > > 
> > > Setting the order means recreating the folio the way prep_compound_page()
> > > does. I think it's better to postpone it until the folio is requested. This
> > > way it might run after SMP is enabled. 
> > 
> > I see, that makes sense, but also it could still use page->order..
> 
> But there's no page->order :)

I mean this:

static inline unsigned int folio_order(const struct folio *folio)
{
        if (!folio_test_large(folio))
                return 0;
        return folio->_flags_1 & 0xff;
}
 
> > Yes, but also we wouldn't have page->private to make it work.. Somehow
> > anything we want to carry over would have to become encoded in the
> > memdesc directly.
> 
> This is a problem to solve in 2026 :)

Yes :)

Jason
Jason Gunthorpe March 25, 2025, 2:04 a.m. UTC | #7
On Sun, Mar 23, 2025 at 12:07:58PM -0700, Changyuan Lyu wrote:

> > > +	down_read(&kho_out.tree_lock);
> > > +	if (kho_out.fdt) {
> >
> > What is the lock and fdt test for?
> 
> It is to avoid a race between the following 2 operations:
> - converting the hashtables and mem tracker to FDT,
> - adding new data to hashtable/mem tracker.

I think you should strive to prevent this by code construction at a
higher level.

Do not lock each individual preserve; lock entire object serialization operations.

For instance if we do recursive FDT then you'd lock the call that
builds a single FDT page for a single object.

> In most cases we only need the read lock. Different KHO users can add
> data into their own subnodes in parallel.

Read locks like this are still quite slow on parallel systems; there
is a lot of cacheline bouncing, since taking a read lock still has to
write to the lock memory.

> > What do you imagine this is used for? I'm not sure what value there is
> > in returning a void *? How does the caller "free" this?
> 
> This function is also from Mike :)
> 
> I suppose some KHO users may still
> preserve memory using memory ranges (instead of folio).

I don't know what that would be, but the folio scheme is all about
preserving memory from the page buddy allocator; I don't know what
this is for or how it would be used.

IMHO split this to its own patch and include it in the series that
would use it.

> I guess the caller can free the ranges by free_pages()?

The folios were not set up right, so no.. And if this is the case then
you'd just get the struct page and convert it to a void * with some
helper function, not implement a whole new function...

> > There should be yaml files just like in the normal DT case defining
> > all of this. This level of documentation and stability was one of the
> > selling reasons why FDT is being used here!
> 
> YAML files were dropped because we think it may take a while for our
> schema to be near stable. So we start from some simple plain text. We
> can add some prop and node docs (that are considered stable at this point)
> back to YAML in the next version.

You need to do something to document what is going on here and show
the full schema with some explanation. It is hard to grasp the full
intention just from the C code.

Jason
Mike Rapoport March 26, 2025, 12:07 p.m. UTC | #8
On Mon, Mar 24, 2025 at 05:07:36PM -0300, Jason Gunthorpe wrote:
> On Mon, Mar 24, 2025 at 02:18:34PM -0400, Mike Rapoport wrote:
> > On Sun, Mar 23, 2025 at 03:55:52PM -0300, Jason Gunthorpe wrote:
> > > On Sat, Mar 22, 2025 at 03:12:26PM -0400, Mike Rapoport wrote:
> > >  
> > > > > > +		page->private = order;
> > > > > 
> > > > > Can't just set the page order directly? Why use private?
> > > > 
> > > > Setting the order means recreating the folio the way prep_compound_page()
> > > > does. I think it's better to postpone it until the folio is requested. This
> > > > way it might run after SMP is enabled. 
> > > 
> > > I see, that makes sense, but also it could still use page->order..
> > 
> > But there's no page->order :)
> 
> I mean this:
> 
> static inline unsigned int folio_order(const struct folio *folio)
> {
>         if (!folio_test_large(folio))
>                 return 0;
>         return folio->_flags_1 & 0xff;
> }

I don't think it's better than page->private; KHO will need to
prep_compound_page() anyway, so these will be overwritten there.
And I don't remember, but having those set before prep_compound_page()
might trigger VM_BUG_ON_PGFLAGS().
  
> Jason
Pratyush Yadav March 27, 2025, 10:03 a.m. UTC | #9
Hi Changyuan,

On Wed, Mar 19 2025, Changyuan Lyu wrote:

> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
>
> Introduce APIs allowing KHO users to preserve memory across kexec and
> get access to that memory after boot of the kexeced kernel
>
> kho_preserve_folio() - record a folio to be preserved over kexec
> kho_restore_folio() - recreates the folio from the preserved memory
> kho_preserve_phys() - record physically contiguous range to be
> preserved over kexec.
> kho_restore_phys() - recreates order-0 pages corresponding to the
> preserved physical range
>
> The memory preservations are tracked by two levels of xarrays to manage
> chunks of per-order 512 byte bitmaps. For instance the entire 1G order
> of a 1TB x86 system would fit inside a single 512 byte bitmap. For
> order 0 allocations each bitmap will cover 16M of address space. Thus,
> for 16G of memory at most 512K of bitmap memory will be needed for order 0.
>
> At serialization time all bitmaps are recorded in a linked list of pages
> for the next kernel to process and the physical address of the list is
> recorded in KHO FDT.

Why build the xarray only to transform it down to bitmaps when you can
build the bitmaps from the get go? This would end up wasting both time
and memory. At least from this patch, I don't really see much else being
done with the xarray apart from setting bits in the bitmap.

Of course, with the current linked list structure, this cannot work. But
I don't see why we need to have it. I think having a page-table like
structure would be better -- only instead of having PTEs at the lowest
levels, you have the bitmap.

Just like page tables, each table is page-size. So each page at the
lowest level can have 4k * 8 == 32768 bits. This maps to 128 MiB of 4k
pages. The next level will be pointers to the level 1 table, just like
in page tables. So we get 4096 / 8 == 512 pointers. Each level 2 table
maps to 64 GiB of memory. Similarly, level 3 table maps to 32 TiB and
level 4 to 16 PiB.

Now, __kho_preserve() can just find or allocate the table entry for the
PFN and set its bit. Similar work has to be done when doing the xarray
access as well, so this should have roughly the same performance. When
doing KHO, we just need to record the base address of the table and we
are done. This saves us from doing the expensive copying/transformation
of data in the critical path.

I don't see any obvious downsides compared to the current format. The
serialized state might end up taking slightly more memory due to upper
level tables, but it should still be much less than having two
representations of the same information exist simultaneously.
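
A rough sketch of the lowest two levels, just to illustrate the idea (all
names below are hypothetical and not part of this patch; for handover the
directory entries would of course hold physical addresses):

#define LEAF_BITS	(PAGE_SIZE * 8)			/* 32768 bits == 128 MiB of 4K pages */
#define DIR_ENTRIES	(PAGE_SIZE / sizeof(void *))	/* 512 entries, one dir covers 64 GiB */

struct preserve_leaf {
	unsigned long bits[PAGE_SIZE / sizeof(unsigned long)];
};

struct preserve_dir {
	struct preserve_leaf *leaves[DIR_ENTRIES];
};

static int preserve_pfn(struct preserve_dir *dir, unsigned long pfn)
{
	unsigned long idx = pfn / LEAF_BITS;	/* which 128 MiB leaf */
	struct preserve_leaf *leaf;

	if (idx >= DIR_ENTRIES)
		return -ERANGE;			/* higher levels would extend the reach */

	leaf = dir->leaves[idx];
	if (!leaf) {
		leaf = (struct preserve_leaf *)get_zeroed_page(GFP_KERNEL);
		if (!leaf)
			return -ENOMEM;
		dir->leaves[idx] = leaf;
	}

	set_bit(pfn % LEAF_BITS, leaf->bits);
	return 0;
}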

>
> The next kernel then processes that list, reserves the memory ranges and
> later, when a user requests a folio or a physical range, KHO restores
> corresponding memory map entries.
>
> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> Co-developed-by: Changyuan Lyu <changyuanl@google.com>
> Signed-off-by: Changyuan Lyu <changyuanl@google.com>
[...]
> +static void deserialize_bitmap(unsigned int order,
> +			       struct khoser_mem_bitmap_ptr *elm)
> +{
> +	struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
> +	unsigned long bit;
> +
> +	for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) {
> +		int sz = 1 << (order + PAGE_SHIFT);
> +		phys_addr_t phys =
> +			elm->phys_start + (bit << (order + PAGE_SHIFT));
> +		struct page *page = phys_to_page(phys);
> +
> +		memblock_reserve(phys, sz);
> +		memblock_reserved_mark_noinit(phys, sz);

Why waste time and memory building the reserved ranges? We already have
all the information in the serialized bitmaps, and memblock is already
only allocating from scratch. So we should not need this at all, and
instead simply skip these pages in memblock_free_pages(). With the
page-table like format I mentioned above, this should be very easy since
you can find out whether a page is reserved or not in O(1) time.
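
Continuing the hypothetical preserve_dir/preserve_leaf sketch from earlier in
this mail, such a lookup could be O(1):

static bool pfn_is_preserved(struct preserve_dir *dir, unsigned long pfn)
{
	struct preserve_leaf *leaf;

	if (pfn / LEAF_BITS >= DIR_ENTRIES)
		return false;

	leaf = dir->leaves[pfn / LEAF_BITS];
	return leaf && test_bit(pfn % LEAF_BITS, leaf->bits);
}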

> +		page->private = order;
> +	}
> +}
> +
> +static void __init kho_mem_deserialize(void)
> +{
> +	struct khoser_mem_chunk *chunk;
> +	struct kho_in_node preserved_mem;
> +	const phys_addr_t *mem;
> +	int err;
> +	u32 len;
> +
> +	err = kho_get_node(NULL, "preserved-memory", &preserved_mem);
> +	if (err) {
> +		pr_err("no preserved-memory node: %d\n", err);
> +		return;
> +	}
> +
> +	mem = kho_get_prop(&preserved_mem, "metadata", &len);
> +	if (!mem || len != sizeof(*mem)) {
> +		pr_err("failed to get preserved memory bitmaps\n");
> +		return;
> +	}
> +
> +	chunk = *mem ? phys_to_virt(*mem) : NULL;
> +	while (chunk) {
> +		unsigned int i;
> +
> +		memblock_reserve(virt_to_phys(chunk), sizeof(*chunk));
> +
> +		for (i = 0; i != chunk->hdr.num_elms; i++)
> +			deserialize_bitmap(chunk->hdr.order,
> +					   &chunk->bitmaps[i]);
> +		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
> +	}
> +}
> +
>  /* Helper functions for KHO state tree */
>  
>  struct kho_prop {
[...]
Jason Gunthorpe March 27, 2025, 1:31 p.m. UTC | #10
On Thu, Mar 27, 2025 at 10:03:17AM +0000, Pratyush Yadav wrote:

> Of course, with the current linked list structure, this cannot work. But
> I don't see why we need to have it. I think having a page-table like
> structure would be better -- only instead of having PTEs at the lowest
> levels, you have the bitmap.

Yes, but there is a trade off here of what I could write in 30 mins
and what is maximally possible :) The xarray is providing a page table
implementation in a library form.

I think this whole thing can be optimized, especially the
memblock_reserve side, but the idea here is to get started and once we
have some data on what the actual preservation workload is then
someone can optimize this.

Otherwise we are going to be spending months just polishing this one
patch without any actual data on where the performance issues and hot
spots actually are.

Jason
Pratyush Yadav March 27, 2025, 5:28 p.m. UTC | #11
On Thu, Mar 27 2025, Jason Gunthorpe wrote:

> On Thu, Mar 27, 2025 at 10:03:17AM +0000, Pratyush Yadav wrote:
>
>> Of course, with the current linked list structure, this cannot work. But
>> I don't see why we need to have it. I think having a page-table like
>> structure would be better -- only instead of having PTEs at the lowest
>> levels, you have the bitmap.
>
> Yes, but there is a trade off here of what I could write in 30 mins
> and what is maximally possible :) The xarray is providing a page table
> implementation in a library form.
>
> I think this whole thing can be optimized, especially the
> memblock_reserve side, but the idea here is to get started and once we
> have some data on what the actual preservation workload is then
> someone can optimize this.
>
> Otherwise we are going to be spending months just polishing this one
> patch without any actual data on where the performance issues and hot
> spots actually are.

The memblock_reserve side we can optimize later, I agree. But the memory
preservation format is ABI and I think that is worth spending a little
more time on. And I don't think it should be that much more complex than
the current format.

I want to hack around with it, so I'll give it a try over the next few
days and see what I can come up with.
Jason Gunthorpe March 28, 2025, 12:53 p.m. UTC | #12
On Thu, Mar 27, 2025 at 05:28:40PM +0000, Pratyush Yadav wrote:
> > Otherwise we are going to be spending months just polishing this one
> > patch without any actual data on where the performance issues and hot
> > spots actually are.
> 
> The memblock_reserve side we can optimize later, I agree. But the memory
> preservation format is ABI 

I think the agreement was that nothing is ABI at this point..

> and I think that is worth spending a little
> more time on. And I don't think it should be that much more complex than
> the current format.

Maybe!

Jason
Changyuan Lyu April 2, 2025, 4:44 p.m. UTC | #13
Hi Pratyush, Thanks for suggestions!

On Thu, Mar 27, 2025 at 17:28:40 +0000, Pratyush Yadav <ptyadav@amazon.de> wrote:
> On Thu, Mar 27 2025, Jason Gunthorpe wrote:
>
> > On Thu, Mar 27, 2025 at 10:03:17AM +0000, Pratyush Yadav wrote:
> >
> >> Of course, with the current linked list structure, this cannot work. But
> >> I don't see why we need to have it. I think having a page-table like
> >> structure would be better -- only instead of having PTEs at the lowest
> >> levels, you have the bitmap.
> >
> > Yes, but there is a trade off here of what I could write in 30 mins
> > and what is maximally possible :) The xarray is providing a page table
> > implementation in a library form.
> >
> > I think this whole thing can be optimized, especially the
> > memblock_reserve side, but the idea here is to get started and once we
> > have some data on what the actual preservation workload is then
> > someone can optimize this.
> >
> > Otherwise we are going to be spending months just polishing this one
> > patch without any actual data on where the performance issues and hot
> > spots actually are.
>
> The memblock_reserve side we can optimize later, I agree. But the memory
> preservation format is ABI and I think that is worth spending a little
> more time on. And I don't think it should be that much more complex than
> the current format.
>
> I want to hack around with it, so I'll give it a try over the next few
> days and see what I can come up with.

I agree with Jason that "nothing is ABI at this
point" and it will take some time for KHO to stabilize.

On the other hand, if you have already come up with something working and
simple, we can include it in the next version.

(Sorry for the late reply, I was traveling.)

Best,
Changyuan
Pratyush Yadav April 2, 2025, 4:47 p.m. UTC | #14
Hi,

On Wed, Apr 02 2025, Changyuan Lyu wrote:

> Hi Pratyush, Thanks for suggestions!
>
> On Thu, Mar 27, 2025 at 17:28:40 +0000, Pratyush Yadav <ptyadav@amazon.de> wrote:
>> On Thu, Mar 27 2025, Jason Gunthorpe wrote:
>>
>> > On Thu, Mar 27, 2025 at 10:03:17AM +0000, Pratyush Yadav wrote:
>> >
>> >> Of course, with the current linked list structure, this cannot work. But
>> >> I don't see why we need to have it. I think having a page-table like
>> >> structure would be better -- only instead of having PTEs at the lowest
>> >> levels, you have the bitmap.
>> >
>> > Yes, but there is a trade off here of what I could write in 30 mins
>> > and what is maximally possible :) The xarray is providing a page table
>> > implementation in a library form.
>> >
>> > I think this whole thing can be optimized, especially the
>> > memblock_reserve side, but the idea here is to get started and once we
>> > have some data on what the actual preservation workload is then
>> > someone can optimize this.
>> >
>> > Otherwise we are going to be spending months just polishing this one
>> > patch without any actual data on where the performance issues and hot
>> > spots actually are.
>>
>> The memblock_reserve side we can optimize later, I agree. But the memory
>> preservation format is ABI and I think that is worth spending a little
>> more time on. And I don't think it should be that much more complex than
>> the current format.
>>
>> I want to hack around with it, so I'll give it a try over the next few
>> days and see what I can come up with.
>
> I agree with Jason that "nothing is ABI at this
> point" and it will take some time for KHO to stabilize.
>
> On the other hand, if you have already come up with something working and
> simple, we can include it in the next version.

I already have something that works with zero-order pages. I am
currently implementing support for other orders. It is almost done, but
I need to test it and do a performance comparison with the current
patch. Will post something soon!
Pasha Tatashin April 2, 2025, 6:37 p.m. UTC | #15
On Wed, Apr 2, 2025 at 12:47 PM Pratyush Yadav <ptyadav@amazon.de> wrote:
>
> Hi,
>
> On Wed, Apr 02 2025, Changyuan Lyu wrote:
>
> > Hi Pratyush, Thanks for suggestions!
> >
> > On Thu, Mar 27, 2025 at 17:28:40 +0000, Pratyush Yadav <ptyadav@amazon.de> wrote:
> >> On Thu, Mar 27 2025, Jason Gunthorpe wrote:
> >>
> >> > On Thu, Mar 27, 2025 at 10:03:17AM +0000, Pratyush Yadav wrote:
> >> >
> >> >> Of course, with the current linked list structure, this cannot work. But
> >> >> I don't see why we need to have it. I think having a page-table like
> >> >> structure would be better -- only instead of having PTEs at the lowest
> >> >> levels, you have the bitmap.
> >> >
> >> > Yes, but there is a trade off here of what I could write in 30 mins
> >> > and what is maximally possible :) The xarray is providing a page table
> >> > implementation in a library form.
> >> >
> >> > I think this whole thing can be optimized, especially the
> >> > memblock_reserve side, but the idea here is to get started and once we
> >> > have some data on what the actual preservation workload is then
> >> > someone can optimize this.
> >> >
> >> > Otherwise we are going to be spending months just polishing this one
> >> > patch without any actual data on where the performance issues and hot
> >> > spots actually are.
> >>
> >> The memblock_reserve side we can optimize later, I agree. But the memory
> >> preservation format is ABI and I think that is worth spending a little
> >> more time on. And I don't think it should be that much more complex than
> >> the current format.
> >>
> >> I want to hack around with it, so I'll give it a try over the next few
> >> days and see what I can come up with.
> >
> > I agree with Jason that "nothing is ABI at this
> > point" and it will take some time for KHO to stabilize.
> >
> > On the other hand, if you have already come up with something working and
> > simple, we can include it in the next version.
>
> I already have something that works with zero-order pages. I am
> currently implementing support for other orders. It is almost done, but
> I need to test it and do a performance comparison with the current
> patch. Will post something soon!

Hi Pratyush,

Just to clarify, how soon? We are about to post v6 for KHO, with all
other comments in this thread addressed.

Thanks,
Pasha

>
> --
> Regards,
> Pratyush Yadav
Pratyush Yadav April 2, 2025, 6:49 p.m. UTC | #16
On Wed, Apr 02 2025, Pasha Tatashin wrote:

> On Wed, Apr 2, 2025 at 12:47 PM Pratyush Yadav <ptyadav@amazon.de> wrote:
>>
>> Hi,
>>
>> On Wed, Apr 02 2025, Changyuan Lyu wrote:
>>
>> > Hi Pratyush, Thanks for suggestions!
>> >
>> > On Thu, Mar 27, 2025 at 17:28:40 +0000, Pratyush Yadav <ptyadav@amazon.de> wrote:
[...]
>> >>
>> >> The memblock_reserve side we can optimize later, I agree. But the memory
>> >> preservation format is ABI and I think that is worth spending a little
>> >> more time on. And I don't think it should be that much more complex than
>> >> the current format.
>> >>
>> >> I want to hack around with it, so I'll give it a try over the next few
>> >> days and see what I can come up with.
>> >
>> > I agree with Jason that "nothing is ABI at this
>> > point" and it will take some time for KHO to stabilize.
>> >
>> > On the other hand, if you have already come up with something working and
>> > simple, we can include it in the next version.
>>
>> I already have something that works with zero-order pages. I am
>> currently implementing support for other orders. It is almost done, but
>> I need to test it and do a performance comparison with the current
>> patch. Will post something soon!
>
> Hi Pratyush,
>
> Just to clarify, how soon? We are about to post v6 for KHO, with all
> other comments in this thread addressed.

I have it working, but I need to clean up the code a bit and test it
better. So hopefully end of this week or early next week.
Pratyush Yadav April 2, 2025, 7:16 p.m. UTC | #17
Hi Changyuan,

On Wed, Mar 19 2025, Changyuan Lyu wrote:

> From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
>
> Introduce APIs allowing KHO users to preserve memory across kexec and
> get access to that memory after boot of the kexeced kernel
>
> kho_preserve_folio() - record a folio to be preserved over kexec
> kho_restore_folio() - recreates the folio from the preserved memory
> kho_preserve_phys() - record physically contiguous range to be
> preserved over kexec.
> kho_restore_phys() - recreates order-0 pages corresponding to the
> preserved physical range
>
> The memory preservations are tracked by two levels of xarrays to manage
> chunks of per-order 512 byte bitmaps. For instance the entire 1G order
> of a 1TB x86 system would fit inside a single 512 byte bitmap. For
> order 0 allocations each bitmap will cover 16M of address space. Thus,
> for 16G of memory at most 512K of bitmap memory will be needed for order 0.
>
> At serialization time all bitmaps are recorded in a linked list of pages
> for the next kernel to process and the physical address of the list is
> recorded in KHO FDT.
>
> The next kernel then processes that list, reserves the memory ranges and
> later, when a user requests a folio or a physical range, KHO restores
> corresponding memory map entries.
>
> Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
> Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
> Co-developed-by: Changyuan Lyu <changyuanl@google.com>
> Signed-off-by: Changyuan Lyu <changyuanl@google.com>
> ---
>  include/linux/kexec_handover.h |  38 +++
>  kernel/kexec_handover.c        | 486 ++++++++++++++++++++++++++++++++-
>  2 files changed, 522 insertions(+), 2 deletions(-)
[...]
> +int kho_preserve_phys(phys_addr_t phys, size_t size)
> +{
> +	unsigned long pfn = PHYS_PFN(phys), end_pfn = PHYS_PFN(phys + size);
> +	unsigned int order = ilog2(end_pfn - pfn);

This caught my eye when playing around with the code. It does not put
any limit on the order, so it can exceed NR_PAGE_ORDERS. Also, when
initializing the page after KHO, we pass the order directly to
prep_compound_page() without sanity checking it. The next kernel might
not support all the orders the current one supports. Perhaps something
to fix?
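
One possible direction (sketch only, not part of this patch) would be to
clamp the step order on the preserve side and reject unknown orders on the
restore side, e.g.:

	/* preserve side: don't record an order this kernel can't represent */
	order = min_t(unsigned int, ilog2(end_pfn - pfn), NR_PAGE_ORDERS - 1);

	/* restore side, before calling prep_compound_page(): */
	if (order >= NR_PAGE_ORDERS)
		return NULL;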

> +	unsigned long failed_pfn;
> +	int err = 0;
> +
> +	if (!kho_enable)
> +		return -EOPNOTSUPP;
> +
> +	down_read(&kho_out.tree_lock);
> +	if (kho_out.fdt) {
> +		err = -EBUSY;
> +		goto unlock;
> +	}
> +
> +	for (; pfn < end_pfn;
> +	     pfn += (1 << order), order = ilog2(end_pfn - pfn)) {
> +		err = __kho_preserve(&kho_mem_track, pfn, order);
> +		if (err) {
> +			failed_pfn = pfn;
> +			break;
> +		}
> +	}
[...]
> +struct folio *kho_restore_folio(phys_addr_t phys)
> +{
> +	struct page *page = pfn_to_online_page(PHYS_PFN(phys));
> +	unsigned long order;
> +
> +	if (!page)
> +		return NULL;
> +
> +	order = page->private;
> +	if (order)
> +		prep_compound_page(page, order);
> +	else
> +		kho_restore_page(page);
> +
> +	return page_folio(page);
> +}
[...]

Patch

diff --git a/include/linux/kexec_handover.h b/include/linux/kexec_handover.h
index c665ff6cd728..d52a7b500f4c 100644
--- a/include/linux/kexec_handover.h
+++ b/include/linux/kexec_handover.h
@@ -5,6 +5,7 @@ 
 #include <linux/types.h>
 #include <linux/hashtable.h>
 #include <linux/notifier.h>
+#include <linux/mm_types.h>
 
 struct kho_scratch {
 	phys_addr_t addr;
@@ -54,6 +55,13 @@  int kho_add_string_prop(struct kho_node *node, const char *key,
 int register_kho_notifier(struct notifier_block *nb);
 int unregister_kho_notifier(struct notifier_block *nb);
 
+int kho_preserve_folio(struct folio *folio);
+int kho_unpreserve_folio(struct folio *folio);
+int kho_preserve_phys(phys_addr_t phys, size_t size);
+int kho_unpreserve_phys(phys_addr_t phys, size_t size);
+struct folio *kho_restore_folio(phys_addr_t phys);
+void *kho_restore_phys(phys_addr_t phys, size_t size);
+
 void kho_memory_init(void);
 
 void kho_populate(phys_addr_t handover_fdt_phys, phys_addr_t scratch_phys,
@@ -118,6 +126,36 @@  static inline int unregister_kho_notifier(struct notifier_block *nb)
 	return -EOPNOTSUPP;
 }
 
+static inline int kho_preserve_folio(struct folio *folio)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int kho_unpreserve_folio(struct folio *folio)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int kho_preserve_phys(phys_addr_t phys, size_t size)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline int kho_unpreserve_phys(phys_addr_t phys, size_t size)
+{
+	return -EOPNOTSUPP;
+}
+
+static inline struct folio *kho_restore_folio(phys_addr_t phys)
+{
+	return NULL;
+}
+
+static inline void *kho_restore_phys(phys_addr_t phys, size_t size)
+{
+	return NULL;
+}
+
 static inline void kho_memory_init(void)
 {
 }
diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c
index 6ebad2f023f9..592563c21369 100644
--- a/kernel/kexec_handover.c
+++ b/kernel/kexec_handover.c
@@ -62,6 +62,13 @@  struct kho_out {
 	struct rw_semaphore tree_lock;
 	struct kho_node root;
 
+	/**
+	 * Physical address of the first struct khoser_mem_chunk containing
+	 * serialized data from struct kho_mem_track.
+	 */
+	phys_addr_t first_chunk_phys;
+	struct kho_node preserved_memory;
+
 	void *fdt;
 	u64 fdt_max;
 };
@@ -70,6 +77,7 @@  static struct kho_out kho_out = {
 	.chain_head = BLOCKING_NOTIFIER_INIT(kho_out.chain_head),
 	.tree_lock = __RWSEM_INITIALIZER(kho_out.tree_lock),
 	.root = KHO_NODE_INIT,
+	.preserved_memory = KHO_NODE_INIT,
 	.fdt_max = 10 * SZ_1M,
 };
 
@@ -237,6 +245,461 @@  int kho_node_check_compatible(const struct kho_in_node *node,
 }
 EXPORT_SYMBOL_GPL(kho_node_check_compatible);
 
+/*
+ * Keep track of memory that is to be preserved across KHO.
+ *
+ * The serializing side uses two levels of xarrays to manage chunks of per-order
+ * 512 byte bitmaps. For instance the entire 1G order of a 1TB system would fit
+ * inside a single 512 byte bitmap. For order 0 allocations each bitmap will
+ * cover 16M of address space. Thus, for 16G of memory at most 512K
+ * of bitmap memory will be needed for order 0.
+ *
+ * This approach is fully incremental, as the serialization progresses folios
+ * can continue to be aggregated to the tracker. The final step, immediately prior
+ * to kexec would serialize the xarray information into a linked list for the
+ * successor kernel to parse.
+ */
+
+#define PRESERVE_BITS (512 * 8)
+
+struct kho_mem_phys_bits {
+	DECLARE_BITMAP(preserve, PRESERVE_BITS);
+};
+
+struct kho_mem_phys {
+	/*
+	 * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized
+	 * to order.
+	 */
+	struct xarray phys_bits;
+};
+
+struct kho_mem_track {
+	/* Points to kho_mem_phys, each order gets its own bitmap tree */
+	struct xarray orders;
+};
+
+static struct kho_mem_track kho_mem_track;
+
+static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz)
+{
+	void *elm, *res;
+
+	elm = xa_load(xa, index);
+	if (elm)
+		return elm;
+
+	elm = kzalloc(sz, GFP_KERNEL);
+	if (!elm)
+		return ERR_PTR(-ENOMEM);
+
+	res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL);
+	if (xa_is_err(res))
+		res = ERR_PTR(xa_err(res));
+
+	if (res) {
+		kfree(elm);
+		return res;
+	}
+
+	return elm;
+}
+
+static void __kho_unpreserve(struct kho_mem_track *tracker, unsigned long pfn,
+			     unsigned int order)
+{
+	struct kho_mem_phys_bits *bits;
+	struct kho_mem_phys *physxa;
+	unsigned long pfn_hi = pfn >> order;
+
+	physxa = xa_load(&tracker->orders, order);
+	if (!physxa)
+		return;
+
+	bits = xa_load(&physxa->phys_bits, pfn_hi / PRESERVE_BITS);
+	if (!bits)
+		return;
+
+	clear_bit(pfn_hi % PRESERVE_BITS, bits->preserve);
+}
+
+static int __kho_preserve(struct kho_mem_track *tracker, unsigned long pfn,
+			  unsigned int order)
+{
+	struct kho_mem_phys_bits *bits;
+	struct kho_mem_phys *physxa;
+	unsigned long pfn_hi = pfn >> order;
+
+	might_sleep();
+
+	physxa = xa_load_or_alloc(&tracker->orders, order, sizeof(*physxa));
+	if (IS_ERR(physxa))
+		return PTR_ERR(physxa);
+
+	bits = xa_load_or_alloc(&physxa->phys_bits, pfn_hi / PRESERVE_BITS,
+				sizeof(*bits));
+	if (IS_ERR(bits))
+		return PTR_ERR(bits);
+
+	set_bit(pfn_hi % PRESERVE_BITS, bits->preserve);
+
+	return 0;
+}
+
+/**
+ * kho_preserve_folio - preserve a folio across KHO.
+ * @folio: folio to preserve
+ *
+ * Records that the entire folio is preserved across KHO. The order
+ * will be preserved as well.
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_preserve_folio(struct folio *folio)
+{
+	unsigned long pfn = folio_pfn(folio);
+	unsigned int order = folio_order(folio);
+	int err;
+
+	if (!kho_enable)
+		return -EOPNOTSUPP;
+
+	down_read(&kho_out.tree_lock);
+	if (kho_out.fdt) {
+		err = -EBUSY;
+		goto unlock;
+	}
+
+	err = __kho_preserve(&kho_mem_track, pfn, order);
+
+unlock:
+	up_read(&kho_out.tree_lock);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(kho_preserve_folio);
+
+/**
+ * kho_unpreserve_folio - unpreserve a folio
+ * @folio: folio to unpreserve
+ *
+ * Remove the record of a folio previously preserved by kho_preserve_folio().
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_unpreserve_folio(struct folio *folio)
+{
+	unsigned long pfn = folio_pfn(folio);
+	unsigned int order = folio_order(folio);
+	int err = 0;
+
+	down_read(&kho_out.tree_lock);
+	if (kho_out.fdt) {
+		err = -EBUSY;
+		goto unlock;
+	}
+
+	__kho_unpreserve(&kho_mem_track, pfn, order);
+
+unlock:
+	up_read(&kho_out.tree_lock);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(kho_unpreserve_folio);
+
+/**
+ * kho_preserve_phys - preserve a physically contiguous range across KHO.
+ * @phys: physical address of the range
+ * @size: size of the range
+ *
+ * Records that the entire range from @phys to @phys + @size is preserved
+ * across KHO.
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_preserve_phys(phys_addr_t phys, size_t size)
+{
+	unsigned long pfn = PHYS_PFN(phys), end_pfn = PHYS_PFN(phys + size);
+	unsigned int order = ilog2(end_pfn - pfn);
+	unsigned long failed_pfn;
+	int err = 0;
+
+	if (!kho_enable)
+		return -EOPNOTSUPP;
+
+	down_read(&kho_out.tree_lock);
+	if (kho_out.fdt) {
+		err = -EBUSY;
+		goto unlock;
+	}
+
+	for (; pfn < end_pfn;
+	     pfn += (1 << order), order = ilog2(end_pfn - pfn)) {
+		err = __kho_preserve(&kho_mem_track, pfn, order);
+		if (err) {
+			failed_pfn = pfn;
+			break;
+		}
+	}
+
+	if (err)
+		for (pfn = PHYS_PFN(phys); pfn < failed_pfn;
+		     pfn += (1 << order), order = ilog2(end_pfn - pfn))
+			__kho_unpreserve(&kho_mem_track, pfn, order);
+
+unlock:
+	up_read(&kho_out.tree_lock);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(kho_preserve_phys);
+
+/**
+ * kho_unpreserve_phys - unpreserve a physically contiguous range
+ * @phys: physical address of the range
+ * @size: size of the range
+ *
+ * Remove the record of a range previously preserved by kho_preserve_phys().
+ *
+ * Return: 0 on success, error code on failure
+ */
+int kho_unpreserve_phys(phys_addr_t phys, size_t size)
+{
+	unsigned long pfn = PHYS_PFN(phys), end_pfn = PHYS_PFN(phys + size);
+	unsigned int order = ilog2(end_pfn - pfn);
+	int err = 0;
+
+	down_read(&kho_out.tree_lock);
+	if (kho_out.fdt) {
+		err = -EBUSY;
+		goto unlock;
+	}
+
+	for (; pfn < end_pfn; pfn += (1 << order), order = ilog2(end_pfn - pfn))
+		__kho_unpreserve(&kho_mem_track, pfn, order);
+
+unlock:
+	up_read(&kho_out.tree_lock);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(kho_unpreserve_phys);
+
+/* almost as free_reserved_page(), just don't free the page */
+static void kho_restore_page(struct page *page)
+{
+	ClearPageReserved(page);
+	init_page_count(page);
+	adjust_managed_page_count(page, 1);
+}
+
+struct folio *kho_restore_folio(phys_addr_t phys)
+{
+	struct page *page = pfn_to_online_page(PHYS_PFN(phys));
+	unsigned long order;
+
+	if (!page)
+		return NULL;
+
+	order = page->private;
+	if (order)
+		prep_compound_page(page, order);
+	else
+		kho_restore_page(page);
+
+	return page_folio(page);
+}
+EXPORT_SYMBOL_GPL(kho_restore_folio);
+
+void *kho_restore_phys(phys_addr_t phys, size_t size)
+{
+	unsigned long start_pfn, end_pfn, pfn;
+	void *va = __va(phys);
+
+	start_pfn = PFN_DOWN(phys);
+	end_pfn = PFN_UP(phys + size);
+
+	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+		struct page *page = pfn_to_online_page(pfn);
+
+		if (!page)
+			return NULL;
+		kho_restore_page(page);
+	}
+
+	return va;
+}
+EXPORT_SYMBOL_GPL(kho_restore_phys);
+
+#define KHOSER_PTR(type)          \
+	union {                   \
+		phys_addr_t phys; \
+		type ptr;         \
+	}
+#define KHOSER_STORE_PTR(dest, val)                 \
+	({                                          \
+		(dest).phys = virt_to_phys(val);    \
+		typecheck(typeof((dest).ptr), val); \
+	})
+#define KHOSER_LOAD_PTR(src) \
+	((src).phys ? (typeof((src).ptr))(phys_to_virt((src).phys)) : NULL)
+
+struct khoser_mem_bitmap_ptr {
+	phys_addr_t phys_start;
+	KHOSER_PTR(struct kho_mem_phys_bits *) bitmap;
+};
+
+struct khoser_mem_chunk;
+
+struct khoser_mem_chunk_hdr {
+	KHOSER_PTR(struct khoser_mem_chunk *) next;
+	unsigned int order;
+	unsigned int num_elms;
+};
+
+#define KHOSER_BITMAP_SIZE                                   \
+	((PAGE_SIZE - sizeof(struct khoser_mem_chunk_hdr)) / \
+	 sizeof(struct khoser_mem_bitmap_ptr))
+
+struct khoser_mem_chunk {
+	struct khoser_mem_chunk_hdr hdr;
+	struct khoser_mem_bitmap_ptr bitmaps[KHOSER_BITMAP_SIZE];
+};
+static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE);
+
+static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk,
+					  unsigned long order)
+{
+	struct khoser_mem_chunk *chunk;
+
+	chunk = (struct khoser_mem_chunk *)get_zeroed_page(GFP_KERNEL);
+	if (!chunk)
+		return NULL;
+	chunk->hdr.order = order;
+	if (cur_chunk)
+		KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk);
+	return chunk;
+}
+
+static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk)
+{
+	struct khoser_mem_chunk *chunk = first_chunk;
+
+	while (chunk) {
+		unsigned long chunk_page = (unsigned long)chunk;
+
+		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+		free_page(chunk_page);
+	}
+}
+
+/*
+ * Record all the bitmaps in a linked list of pages for the next kernel to
+ * process. Each chunk holds bitmaps of the same order and each block of bitmaps
+ * starts at a given physical address. This allows the bitmaps to be sparse. The
+ * xarray is used to store them in a tree while building up the data structure,
+ * but the KHO successor kernel only needs to process them once in order.
+ *
+ * All of this memory is normal kmalloc() memory and is not marked for
+ * preservation. The successor kernel will remain isolated to the scratch space
+ * until it completes processing this list. Once processed all the memory
+ * storing these ranges will be marked as free.
+ */
+static struct khoser_mem_chunk *kho_mem_serialize(void)
+{
+	struct kho_mem_track *tracker = &kho_mem_track;
+	struct khoser_mem_chunk *first_chunk = NULL;
+	struct khoser_mem_chunk *chunk = NULL;
+	struct kho_mem_phys *physxa;
+	unsigned long order;
+
+	xa_for_each(&tracker->orders, order, physxa) {
+		struct kho_mem_phys_bits *bits;
+		unsigned long phys;
+
+		chunk = new_chunk(chunk, order);
+		if (!chunk)
+			goto err_free;
+
+		if (!first_chunk)
+			first_chunk = chunk;
+
+		xa_for_each(&physxa->phys_bits, phys, bits) {
+			struct khoser_mem_bitmap_ptr *elm;
+
+			if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) {
+				chunk = new_chunk(chunk, order);
+				if (!chunk)
+					goto err_free;
+			}
+
+			elm = &chunk->bitmaps[chunk->hdr.num_elms];
+			chunk->hdr.num_elms++;
+			elm->phys_start = (phys * PRESERVE_BITS)
+					  << (order + PAGE_SHIFT);
+			KHOSER_STORE_PTR(elm->bitmap, bits);
+		}
+	}
+
+	return first_chunk;
+
+err_free:
+	kho_mem_ser_free(first_chunk);
+	return ERR_PTR(-ENOMEM);
+}
+
+static void deserialize_bitmap(unsigned int order,
+			       struct khoser_mem_bitmap_ptr *elm)
+{
+	struct kho_mem_phys_bits *bitmap = KHOSER_LOAD_PTR(elm->bitmap);
+	unsigned long bit;
+
+	for_each_set_bit(bit, bitmap->preserve, PRESERVE_BITS) {
+		int sz = 1 << (order + PAGE_SHIFT);
+		phys_addr_t phys =
+			elm->phys_start + (bit << (order + PAGE_SHIFT));
+		struct page *page = phys_to_page(phys);
+
+		memblock_reserve(phys, sz);
+		memblock_reserved_mark_noinit(phys, sz);
+		page->private = order;
+	}
+}
+
+static void __init kho_mem_deserialize(void)
+{
+	struct khoser_mem_chunk *chunk;
+	struct kho_in_node preserved_mem;
+	const phys_addr_t *mem;
+	int err;
+	u32 len;
+
+	err = kho_get_node(NULL, "preserved-memory", &preserved_mem);
+	if (err) {
+		pr_err("no preserved-memory node: %d\n", err);
+		return;
+	}
+
+	mem = kho_get_prop(&preserved_mem, "metadata", &len);
+	if (!mem || len != sizeof(*mem)) {
+		pr_err("failed to get preserved memory bitmaps\n");
+		return;
+	}
+
+	chunk = *mem ? phys_to_virt(*mem) : NULL;
+	while (chunk) {
+		unsigned int i;
+
+		memblock_reserve(virt_to_phys(chunk), sizeof(*chunk));
+
+		for (i = 0; i != chunk->hdr.num_elms; i++)
+			deserialize_bitmap(chunk->hdr.order,
+					   &chunk->bitmaps[i]);
+		chunk = KHOSER_LOAD_PTR(chunk->hdr.next);
+	}
+}
+
 /* Helper functions for KHO state tree */
 
 struct kho_prop {
@@ -545,6 +1008,11 @@  static int kho_unfreeze(void)
 	if (fdt)
 		kvfree(fdt);
 
+	if (kho_out.first_chunk_phys) {
+		kho_mem_ser_free(phys_to_virt(kho_out.first_chunk_phys));
+		kho_out.first_chunk_phys = 0;
+	}
+
 	err = blocking_notifier_call_chain(&kho_out.chain_head,
 					   KEXEC_KHO_UNFREEZE, NULL);
 	err = notifier_to_errno(err);
@@ -633,6 +1101,7 @@  static int kho_finalize(void)
 {
 	int err = 0;
 	void *fdt;
+	struct khoser_mem_chunk *first_chunk;
 
 	fdt = kvmalloc(kho_out.fdt_max, GFP_KERNEL);
 	if (!fdt)
@@ -648,6 +1117,13 @@  static int kho_finalize(void)
 	kho_out.fdt = fdt;
 	up_write(&kho_out.tree_lock);
 
+	first_chunk = kho_mem_serialize();
+	if (IS_ERR(first_chunk)) {
+		err = PTR_ERR(first_chunk);
+		goto unfreeze;
+	}
+	kho_out.first_chunk_phys = first_chunk ? virt_to_phys(first_chunk) : 0;
+
 	err = kho_convert_tree(fdt, kho_out.fdt_max);
 
 unfreeze:
@@ -829,6 +1305,10 @@  static __init int kho_init(void)
 
 	kho_out.root.name = "";
 	err = kho_add_string_prop(&kho_out.root, "compatible", "kho-v1");
+	err |= kho_add_prop(&kho_out.preserved_memory, "metadata",
+			    &kho_out.first_chunk_phys, sizeof(phys_addr_t));
+	err |= kho_add_node(&kho_out.root, "preserved-memory",
+			    &kho_out.preserved_memory);
 	if (err)
 		goto err_free_scratch;
 
@@ -1079,10 +1559,12 @@  static void __init kho_release_scratch(void)
 
 void __init kho_memory_init(void)
 {
-	if (!kho_get_fdt())
+	if (!kho_get_fdt()) {
 		kho_reserve_scratch();
-	else
+	} else {
+		kho_mem_deserialize();
 		kho_release_scratch();
+	}
 }
 
 void __init kho_populate(phys_addr_t handover_fdt_phys,