diff mbox series

[1/3] kexec: Prevent removal of memory in use by a loaded kexec image

Message ID 20200326180730.4754-2-james.morse@arm.com (mailing list archive)
State New, archived
Headers show
Series kexec/memory_hotplug: Prevent removal and accidental use | expand

Commit Message

James Morse March 26, 2020, 6:07 p.m. UTC
An image loaded for kexec is not stored in place, instead its segments
are scattered through memory, and are re-assembled when needed. In the
meantime, the target memory may have been removed.

Because mm is not aware that this memory is still in use, it allows it
to be removed.

Add a memory notifier to prevent the removal of memory regions that
overlap with a loaded kexec image segment. e.g., when triggered from the
Qemu console:
| kexec_core: memory region in use
| memory memory32: Offline failed.

Signed-off-by: James Morse <james.morse@arm.com>
---
 kernel/kexec_core.c | 56 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

Comments

Anshuman Khandual March 27, 2020, 12:43 a.m. UTC | #1
On 03/26/2020 11:37 PM, James Morse wrote:
> An image loaded for kexec is not stored in place, instead its segments
> are scattered through memory, and are re-assembled when needed. In the
> meantime, the target memory may have been removed.
> 
> Because mm is not aware that this memory is still in use, it allows it
> to be removed.

Why does the isolation process not fail when these pages are currently
being used by kexec?

> 
> Add a memory notifier to prevent the removal of memory regions that
> overlap with a loaded kexec image segment. e.g., when triggered from the
> Qemu console:
> | kexec_core: memory region in use
> | memory memory32: Offline failed.

Yes this is definitely an added protection for these kexec loaded kernels
memory areas from being offlined but I would have expected the preceding
offlining to have failed as well.

> 
> Signed-off-by: James Morse <james.morse@arm.com>
> ---
>  kernel/kexec_core.c | 56 +++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 56 insertions(+)
> 
> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
> index c19c0dad1ebe..ba1d91e868ca 100644
> --- a/kernel/kexec_core.c
> +++ b/kernel/kexec_core.c
> @@ -12,6 +12,7 @@
>  #include <linux/slab.h>
>  #include <linux/fs.h>
>  #include <linux/kexec.h>
> +#include <linux/memory.h>
>  #include <linux/mutex.h>
>  #include <linux/list.h>
>  #include <linux/highmem.h>
> @@ -22,10 +23,12 @@
>  #include <linux/elf.h>
>  #include <linux/elfcore.h>
>  #include <linux/utsname.h>
> +#include <linux/notifier.h>
>  #include <linux/numa.h>
>  #include <linux/suspend.h>
>  #include <linux/device.h>
>  #include <linux/freezer.h>
> +#include <linux/pfn.h>
>  #include <linux/pm.h>
>  #include <linux/cpu.h>
>  #include <linux/uaccess.h>
> @@ -1219,3 +1222,56 @@ void __weak arch_kexec_protect_crashkres(void)
>  
>  void __weak arch_kexec_unprotect_crashkres(void)
>  {}
> +
> +/*
> + * If user-space wants to offline memory that is in use by a loaded kexec
> + * image, it should unload the image first.
> + */

This would probably need an update to the kexec user manual and the related
system call man pages as well.

> +static int mem_remove_cb(struct notifier_block *nb, unsigned long action,
> +			 void *data)
> +{
> +	int rv = NOTIFY_OK, i;
> +	struct memory_notify *arg = data;
> +	unsigned long pfn = arg->start_pfn;
> +	unsigned long nr_segments, sstart, send;
> +	unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
> +
> +	might_sleep();

Required ?

> +
> +	if (action != MEM_GOING_OFFLINE)
> +		return NOTIFY_DONE;
> +
> +	mutex_lock(&kexec_mutex);
> +	if (kexec_image) {
> +		nr_segments = kexec_image->nr_segments;
> +
> +		for (i = 0; i < nr_segments; i++) {
> +			sstart = PFN_DOWN(kexec_image->segment[i].mem);
> +			send = PFN_UP(kexec_image->segment[i].mem +
> +				      kexec_image->segment[i].memsz);
> +
> +			if ((pfn <= sstart && sstart < end_pfn) ||
> +			    (pfn <= send && send < end_pfn)) {
> +				pr_warn("Memory region in use\n");
> +				rv = NOTIFY_BAD;
> +				break;
> +			}
> +		}
> +	}
> +	mutex_unlock(&kexec_mutex);
> +
> +	return rv;

Variable 'rv' is redundant, should use NOTIFY_[BAD|OK] directly instead.

> +}
> +
> +static struct notifier_block mem_remove_nb = {
> +	.notifier_call = mem_remove_cb,
> +};
> +
> +static int __init register_mem_remove_cb(void)
> +{
> +	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))

Shouldn't all of this new code be wrapped in CONFIG_MEMORY_HOTREMOVE, to
reduce the scope as well as the final code size when the config is disabled?

> +		return register_memory_notifier(&mem_remove_nb);
> +
> +	return 0;
> +}
> +device_initcall(register_mem_remove_cb);
>
Baoquan He March 27, 2020, 2:34 a.m. UTC | #2
On 03/26/20 at 06:07pm, James Morse wrote:
> An image loaded for kexec is not stored in place, instead its segments
> are scattered through memory, and are re-assembled when needed. In the
> meantime, the target memory may have been removed.
> 
> Because mm is not aware that this memory is still in use, it allows it
> to be removed.
> 
> Add a memory notifier to prevent the removal of memory regions that
> overlap with a loaded kexec image segment. e.g., when triggered from the
> Qemu console:
> | kexec_core: memory region in use
> | memory memory32: Offline failed.

As I replied to the cover letter, usually we do the loading and jumping of
the kexec-ed kernel at one time. If we expect to do them at different
times, I agree we should do something to make things safer if someone
really wants to do that, since it's allowed. Can we do anything with
the existing notifier?

Mem hotplug has got a notifier to notice it will offline a memory
region.
memory_notify(MEM_OFFLINE, &arg);

> 
> Signed-off-by: James Morse <james.morse@arm.com>
> ---
>  kernel/kexec_core.c | 56 +++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 56 insertions(+)
> 
> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
> index c19c0dad1ebe..ba1d91e868ca 100644
> --- a/kernel/kexec_core.c
> +++ b/kernel/kexec_core.c
> @@ -12,6 +12,7 @@
>  #include <linux/slab.h>
>  #include <linux/fs.h>
>  #include <linux/kexec.h>
> +#include <linux/memory.h>
>  #include <linux/mutex.h>
>  #include <linux/list.h>
>  #include <linux/highmem.h>
> @@ -22,10 +23,12 @@
>  #include <linux/elf.h>
>  #include <linux/elfcore.h>
>  #include <linux/utsname.h>
> +#include <linux/notifier.h>
>  #include <linux/numa.h>
>  #include <linux/suspend.h>
>  #include <linux/device.h>
>  #include <linux/freezer.h>
> +#include <linux/pfn.h>
>  #include <linux/pm.h>
>  #include <linux/cpu.h>
>  #include <linux/uaccess.h>
> @@ -1219,3 +1222,56 @@ void __weak arch_kexec_protect_crashkres(void)
>  
>  void __weak arch_kexec_unprotect_crashkres(void)
>  {}
> +
> +/*
> + * If user-space wants to offline memory that is in use by a loaded kexec
> + * image, it should unload the image first.
> + */
> +static int mem_remove_cb(struct notifier_block *nb, unsigned long action,
> +			 void *data)
> +{
> +	int rv = NOTIFY_OK, i;
> +	struct memory_notify *arg = data;
> +	unsigned long pfn = arg->start_pfn;
> +	unsigned long nr_segments, sstart, send;
> +	unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
> +
> +	might_sleep();
> +
> +	if (action != MEM_GOING_OFFLINE)
> +		return NOTIFY_DONE;
> +
> +	mutex_lock(&kexec_mutex);
> +	if (kexec_image) {
> +		nr_segments = kexec_image->nr_segments;
> +
> +		for (i = 0; i < nr_segments; i++) {
> +			sstart = PFN_DOWN(kexec_image->segment[i].mem);
> +			send = PFN_UP(kexec_image->segment[i].mem +
> +				      kexec_image->segment[i].memsz);
> +
> +			if ((pfn <= sstart && sstart < end_pfn) ||
> +			    (pfn <= send && send < end_pfn)) {
> +				pr_warn("Memory region in use\n");
> +				rv = NOTIFY_BAD;
> +				break;
> +			}
> +		}
> +	}
> +	mutex_unlock(&kexec_mutex);
> +
> +	return rv;
> +}
> +
> +static struct notifier_block mem_remove_nb = {
> +	.notifier_call = mem_remove_cb,
> +};
> +
> +static int __init register_mem_remove_cb(void)
> +{
> +	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
> +		return register_memory_notifier(&mem_remove_nb);
> +
> +	return 0;
> +}
> +device_initcall(register_mem_remove_cb);
> -- 
> 2.25.1
> 
>
Baoquan He March 27, 2020, 2:54 a.m. UTC | #3
On 03/27/20 at 06:13am, Anshuman Khandual wrote:
> 
> 
> On 03/26/2020 11:37 PM, James Morse wrote:
> > An image loaded for kexec is not stored in place, instead its segments
> > are scattered through memory, and are re-assembled when needed. In the
> > meantime, the target memory may have been removed.
> > 
> > Because mm is not aware that this memory is still in use, it allows it
> > to be removed.
> 
> Why the isolation process does not fail when these pages are currently
> being used by kexec ?

That is a trick of the kexec implementation. When loading a kexec-ed kernel, it
just reads in the kernel image in user space, then searches for a place
in the whole system RAM where it's going to put the kernel image, but won't
put the kernel image in the found region immediately; it just books a
room ahead. When you execute 'kexec -e' to trigger the kexec jump, it will copy
the kernel image into the booked room. So the booking is only recorded in
kexec's own data structure.

> 
> > 
> > Add a memory notifier to prevent the removal of memory regions that
> > overlap with a loaded kexec image segment. e.g., when triggered from the
> > Qemu console:
> > | kexec_core: memory region in use
> > | memory memory32: Offline failed.
> 
> Yes this is definitely an added protection for these kexec loaded kernels
> memory areas from being offlined but I would have expected the preceding
> offlining to have failed as well.
> 
> > 
> > Signed-off-by: James Morse <james.morse@arm.com>
> > ---
> >  kernel/kexec_core.c | 56 +++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 56 insertions(+)
> > 
> > diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
> > index c19c0dad1ebe..ba1d91e868ca 100644
> > --- a/kernel/kexec_core.c
> > +++ b/kernel/kexec_core.c
> > @@ -12,6 +12,7 @@
> >  #include <linux/slab.h>
> >  #include <linux/fs.h>
> >  #include <linux/kexec.h>
> > +#include <linux/memory.h>
> >  #include <linux/mutex.h>
> >  #include <linux/list.h>
> >  #include <linux/highmem.h>
> > @@ -22,10 +23,12 @@
> >  #include <linux/elf.h>
> >  #include <linux/elfcore.h>
> >  #include <linux/utsname.h>
> > +#include <linux/notifier.h>
> >  #include <linux/numa.h>
> >  #include <linux/suspend.h>
> >  #include <linux/device.h>
> >  #include <linux/freezer.h>
> > +#include <linux/pfn.h>
> >  #include <linux/pm.h>
> >  #include <linux/cpu.h>
> >  #include <linux/uaccess.h>
> > @@ -1219,3 +1222,56 @@ void __weak arch_kexec_protect_crashkres(void)
> >  
> >  void __weak arch_kexec_unprotect_crashkres(void)
> >  {}
> > +
> > +/*
> > + * If user-space wants to offline memory that is in use by a loaded kexec
> > + * image, it should unload the image first.
> > + */
> 
> Probably this would need kexec user manual and related system call man pages
> update as well.
> 
> > +static int mem_remove_cb(struct notifier_block *nb, unsigned long action,
> > +			 void *data)
> > +{
> > +	int rv = NOTIFY_OK, i;
> > +	struct memory_notify *arg = data;
> > +	unsigned long pfn = arg->start_pfn;
> > +	unsigned long nr_segments, sstart, send;
> > +	unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
> > +
> > +	might_sleep();
> 
> Required ?
> 
> > +
> > +	if (action != MEM_GOING_OFFLINE)
> > +		return NOTIFY_DONE;
> > +
> > +	mutex_lock(&kexec_mutex);
> > +	if (kexec_image) {
> > +		nr_segments = kexec_image->nr_segments;
> > +
> > +		for (i = 0; i < nr_segments; i++) {
> > +			sstart = PFN_DOWN(kexec_image->segment[i].mem);
> > +			send = PFN_UP(kexec_image->segment[i].mem +
> > +				      kexec_image->segment[i].memsz);
> > +
> > +			if ((pfn <= sstart && sstart < end_pfn) ||
> > +			    (pfn <= send && send < end_pfn)) {
> > +				pr_warn("Memory region in use\n");
> > +				rv = NOTIFY_BAD;
> > +				break;
> > +			}
> > +		}
> > +	}
> > +	mutex_unlock(&kexec_mutex);
> > +
> > +	return rv;
> 
> Variable 'rv' is redundant, should use NOTIFY_[BAD|OK] directly instead.
> 
> > +}
> > +
> > +static struct notifier_block mem_remove_nb = {
> > +	.notifier_call = mem_remove_cb,
> > +};
> > +
> > +static int __init register_mem_remove_cb(void)
> > +{
> > +	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
> 
> Should not all these new code here be wrapped with CONFIG_MEMORY_HOTREMOVE
> to reduce the scope as well as final code size when the config is disabled.
> 
> > +		return register_memory_notifier(&mem_remove_nb);
> > +
> > +	return 0;
> > +}
> > +device_initcall(register_mem_remove_cb);
> > 
>
David Hildenbrand March 27, 2020, 9:30 a.m. UTC | #4
On 26.03.20 19:07, James Morse wrote:
> An image loaded for kexec is not stored in place, instead its segments
> are scattered through memory, and are re-assembled when needed. In the
> meantime, the target memory may have been removed.
> 
> Because mm is not aware that this memory is still in use, it allows it
> to be removed.
> 
> Add a memory notifier to prevent the removal of memory regions that
> overlap with a loaded kexec image segment. e.g., when triggered from the
> Qemu console:
> | kexec_core: memory region in use
> | memory memory32: Offline failed.
> 
> Signed-off-by: James Morse <james.morse@arm.com>
> ---
>  kernel/kexec_core.c | 56 +++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 56 insertions(+)
> 
> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
> index c19c0dad1ebe..ba1d91e868ca 100644
> --- a/kernel/kexec_core.c
> +++ b/kernel/kexec_core.c
> @@ -12,6 +12,7 @@
>  #include <linux/slab.h>
>  #include <linux/fs.h>
>  #include <linux/kexec.h>
> +#include <linux/memory.h>
>  #include <linux/mutex.h>
>  #include <linux/list.h>
>  #include <linux/highmem.h>
> @@ -22,10 +23,12 @@
>  #include <linux/elf.h>
>  #include <linux/elfcore.h>
>  #include <linux/utsname.h>
> +#include <linux/notifier.h>
>  #include <linux/numa.h>
>  #include <linux/suspend.h>
>  #include <linux/device.h>
>  #include <linux/freezer.h>
> +#include <linux/pfn.h>
>  #include <linux/pm.h>
>  #include <linux/cpu.h>
>  #include <linux/uaccess.h>
> @@ -1219,3 +1222,56 @@ void __weak arch_kexec_protect_crashkres(void)
>  
>  void __weak arch_kexec_unprotect_crashkres(void)
>  {}
> +
> +/*
> + * If user-space wants to offline memory that is in use by a loaded kexec
> + * image, it should unload the image first.
> + */
> +static int mem_remove_cb(struct notifier_block *nb, unsigned long action,
> +			 void *data)
> +{
> +	int rv = NOTIFY_OK, i;
> +	struct memory_notify *arg = data;
> +	unsigned long pfn = arg->start_pfn;
> +	unsigned long nr_segments, sstart, send;
> +	unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
> +
> +	might_sleep();
> +
> +	if (action != MEM_GOING_OFFLINE)
> +		return NOTIFY_DONE;
> +
> +	mutex_lock(&kexec_mutex);
> +	if (kexec_image) {
> +		nr_segments = kexec_image->nr_segments;
> +
> +		for (i = 0; i < nr_segments; i++) {
> +			sstart = PFN_DOWN(kexec_image->segment[i].mem);
> +			send = PFN_UP(kexec_image->segment[i].mem +
> +				      kexec_image->segment[i].memsz);
> +
> +			if ((pfn <= sstart && sstart < end_pfn) ||
> +			    (pfn <= send && send < end_pfn)) {
> +				pr_warn("Memory region in use\n");
> +				rv = NOTIFY_BAD;
> +				break;
> +			}
> +		}
> +	}
> +	mutex_unlock(&kexec_mutex);
> +
> +	return rv;
> +}
> +
> +static struct notifier_block mem_remove_nb = {
> +	.notifier_call = mem_remove_cb,
> +};
> +
> +static int __init register_mem_remove_cb(void)
> +{
> +	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
> +		return register_memory_notifier(&mem_remove_nb);
> +
> +	return 0;
> +}
> +device_initcall(register_mem_remove_cb);
> 

E.g., in kernel/kexec_core.c:kimage_alloc_pages()

"SetPageReserved(pages + i);"

Pages that are reserved cannot get offlined. How are you able to trigger
that before this patch? (where is the allocation path for kexec, which
will not set the pages reserved?)
James Morse March 27, 2020, 3:46 p.m. UTC | #5
Hi Anshuman,

On 3/27/20 12:43 AM, Anshuman Khandual wrote:
> On 03/26/2020 11:37 PM, James Morse wrote:
>> An image loaded for kexec is not stored in place, instead its segments
>> are scattered through memory, and are re-assembled when needed. In the
>> meantime, the target memory may have been removed.
>>
>> Because mm is not aware that this memory is still in use, it allows it
>> to be removed.

> Why the isolation process does not fail when these pages are currently
> being used by kexec ?

Kexec isn't using them right now, but it will once kexec is triggered.
Those physical addresses are held in some internal kexec data structures until
kexec is triggered.


>> Add a memory notifier to prevent the removal of memory regions that
>> overlap with a loaded kexec image segment. e.g., when triggered from the
>> Qemu console:
>> | kexec_core: memory region in use
>> | memory memory32: Offline failed.
> 
> Yes this is definitely an added protection for these kexec loaded kernels
> memory areas from being offlined but I would have expected the preceding
> offlining to have failed as well.

kexec hasn't allocated the memory; part of the regions user-space may specify for
the next kernel may be in use. There is nothing to stop the memory being used in
the meantime.


Any other way of doing this would prevent us saying why it failed.
Like this, the user can spot the 'kexec: Memory region in use' message, and
unload kexec.


>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>> index c19c0dad1ebe..ba1d91e868ca 100644
>> --- a/kernel/kexec_core.c
>> +++ b/kernel/kexec_core.c
>> @@ -1219,3 +1222,56 @@ void __weak arch_kexec_protect_crashkres(void)
>>  
>>  void __weak arch_kexec_unprotect_crashkres(void)
>>  {}
>> +
>> +/*
>> + * If user-space wants to offline memory that is in use by a loaded kexec
>> + * image, it should unload the image first.
>> + */

> Probably this would need kexec user manual and related system call man pages
> update as well.

I can't see anything relevant under Documentation. (kdump yes, kexec no...)


>> +static int mem_remove_cb(struct notifier_block *nb, unsigned long action,
>> +			 void *data)
>> +{
>> +	int rv = NOTIFY_OK, i;
>> +	struct memory_notify *arg = data;
>> +	unsigned long pfn = arg->start_pfn;
>> +	unsigned long nr_segments, sstart, send;
>> +	unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
>> +
>> +	might_sleep();
> 
> Required ?

Habit, and I think best practice. We take a mutex, so might_sleep(), but we also
conditionally return before lockdep would see the mutex. Having this annotation
means a dangerous change to the way this is called triggers a warning without
having to test memory hotplug explicitly.


>> +
>> +	if (action != MEM_GOING_OFFLINE)
>> +		return NOTIFY_DONE;
>> +
>> +	mutex_lock(&kexec_mutex);
>> +	if (kexec_image) {
>> +		nr_segments = kexec_image->nr_segments;
>> +
>> +		for (i = 0; i < nr_segments; i++) {
>> +			sstart = PFN_DOWN(kexec_image->segment[i].mem);
>> +			send = PFN_UP(kexec_image->segment[i].mem +
>> +				      kexec_image->segment[i].memsz);
>> +
>> +			if ((pfn <= sstart && sstart < end_pfn) ||
>> +			    (pfn <= send && send < end_pfn)) {
>> +				pr_warn("Memory region in use\n");
>> +				rv = NOTIFY_BAD;
>> +				break;
>> +			}
>> +		}
>> +	}
>> +	mutex_unlock(&kexec_mutex);
>> +
>> +	return rv;
> 
> Variable 'rv' is redundant, should use NOTIFY_[BAD|OK] directly instead.

You'd prefer a mutex_unlock() in the middle of the loop? ... or goto?
(I'm not convinced)

>> +}
>> +
>> +static struct notifier_block mem_remove_nb = {
>> +	.notifier_call = mem_remove_cb,
>> +};
>> +
>> +static int __init register_mem_remove_cb(void)
>> +{
>> +	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
> 
> Should not all these new code here be wrapped with CONFIG_MEMORY_HOTREMOVE
> to reduce the scope as well as final code size when the config is disabled.

The compiler is really good at this. "if (false)" means this is all dead code,
and static means it's not exported, so the compiler is free to remove it.

Not #ifdef-ing it makes it much more readable, and means the compiler checks it's
valid C before it removes it. This avoids weird header include problems that
only show up on some rand-config builds.


Thanks,

James

>> +		return register_memory_notifier(&mem_remove_nb);
>> +
>> +	return 0;
>> +}
>> +device_initcall(register_mem_remove_cb);
>>
>
James Morse March 27, 2020, 4:56 p.m. UTC | #6
Hi David,

On 3/27/20 9:30 AM, David Hildenbrand wrote:
> On 26.03.20 19:07, James Morse wrote:
>> An image loaded for kexec is not stored in place, instead its segments
>> are scattered through memory, and are re-assembled when needed. In the
>> meantime, the target memory may have been removed.
>>
>> Because mm is not aware that this memory is still in use, it allows it
>> to be removed.
>>
>> Add a memory notifier to prevent the removal of memory regions that
>> overlap with a loaded kexec image segment. e.g., when triggered from the
>> Qemu console:
>> | kexec_core: memory region in use
>> | memory memory32: Offline failed.
>>
>> Signed-off-by: James Morse <james.morse@arm.com>
>> ---
>>  kernel/kexec_core.c | 56 +++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 56 insertions(+)
>>
>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>> index c19c0dad1ebe..ba1d91e868ca 100644
>> --- a/kernel/kexec_core.c
>> +++ b/kernel/kexec_core.c

> E.g., in kernel/kexec_core.c:kimage_alloc_pages()
> 
> "SetPageReserved(pages + i);"
> 
> Pages that are reserved cannot get offlined. How are you able to trigger
> that before this patch? (where is the allocation path for kexec, which
> will not set the pages reserved?)

This sets page reserved on the memory it gets back from
alloc_pages() in kimage_alloc_pages(). This is when you load the image[0].

The problem I see is for the target or destination memory once you execute the
image. Once machine_kexec() runs, it tries to write to this, assuming it is
still present...


How can I make the commit message clearer?

're-assembled' and 'target memory' aren't quite cutting it, is there a
correct term to use?
(destination?)



Thanks,

James


[0] Just to convince myself:
| kimage_alloc_pages+0x30/0x15c
| kimage_alloc_page+0x210/0x7d8
| kimage_load_segment+0x14c/0x8c8
| __arm64_sys_kexec_load+0x4f0/0x720
| do_el0_svc+0x13c/0x3c0
| el0_sync_handler+0x9c/0x3c0
| el0_sync+0x158/0x180
David Hildenbrand March 27, 2020, 5:06 p.m. UTC | #7
On 27.03.20 17:56, James Morse wrote:
> Hi David,
> 
> On 3/27/20 9:30 AM, David Hildenbrand wrote:
>> On 26.03.20 19:07, James Morse wrote:
>>> An image loaded for kexec is not stored in place, instead its segments
>>> are scattered through memory, and are re-assembled when needed. In the
>>> meantime, the target memory may have been removed.
>>>
>>> Because mm is not aware that this memory is still in use, it allows it
>>> to be removed.
>>>
>>> Add a memory notifier to prevent the removal of memory regions that
>>> overlap with a loaded kexec image segment. e.g., when triggered from the
>>> Qemu console:
>>> | kexec_core: memory region in use
>>> | memory memory32: Offline failed.
>>>
>>> Signed-off-by: James Morse <james.morse@arm.com>
>>> ---
>>>  kernel/kexec_core.c | 56 +++++++++++++++++++++++++++++++++++++++++++++
>>>  1 file changed, 56 insertions(+)
>>>
>>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>>> index c19c0dad1ebe..ba1d91e868ca 100644
>>> --- a/kernel/kexec_core.c
>>> +++ b/kernel/kexec_core.c
> 
>> E.g., in kernel/kexec_core.c:kimage_alloc_pages()
>>
>> "SetPageReserved(pages + i);"
>>
>> Pages that are reserved cannot get offlined. How are you able to trigger
>> that before this patch? (where is the allocation path for kexec, which
>> will not set the pages reserved?)
> 
> This sets page reserved on the memory it gets back from
> alloc_pages() in kimage_alloc_pages(). This is when you load the image[0].
> 
> The problem I see is for the target or destination memory once you execute the
> image. Once machine_kexec() runs, it tries to write to this, assuming it is
> still present...

Let's recap

1. You load the image. You allocate memory for e.g., the kexec kernel.
The pages will be marked PG_reserved, so they cannot be offlined.

2. You do the kexec. The kexec kernel will only operate on a reserved
memory region (reserved via e.g., kernel cmdline crashkernel=128M).

Is it that in 2., the reserved memory region (for the crashkernel) could
have been offlined in the meantime?
James Morse March 27, 2020, 6:07 p.m. UTC | #8
Hi David,

On 3/27/20 5:06 PM, David Hildenbrand wrote:
> On 27.03.20 17:56, James Morse wrote:
>> On 3/27/20 9:30 AM, David Hildenbrand wrote:
>>> On 26.03.20 19:07, James Morse wrote:
>>>> An image loaded for kexec is not stored in place, instead its segments
>>>> are scattered through memory, and are re-assembled when needed. In the
>>>> meantime, the target memory may have been removed.
>>>>
>>>> Because mm is not aware that this memory is still in use, it allows it
>>>> to be removed.
>>>>
>>>> Add a memory notifier to prevent the removal of memory regions that
>>>> overlap with a loaded kexec image segment. e.g., when triggered from the
>>>> Qemu console:
>>>> | kexec_core: memory region in use
>>>> | memory memory32: Offline failed.

>>>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>>>> index c19c0dad1ebe..ba1d91e868ca 100644
>>>> --- a/kernel/kexec_core.c
>>>> +++ b/kernel/kexec_core.c
>>
>>> E.g., in kernel/kexec_core.c:kimage_alloc_pages()
>>>
>>> "SetPageReserved(pages + i);"
>>>
>>> Pages that are reserved cannot get offlined. How are you able to trigger
>>> that before this patch? (where is the allocation path for kexec, which
>>> will not set the pages reserved?)
>>
>> This sets page reserved on the memory it gets back from
>> alloc_pages() in kimage_alloc_pages(). This is when you load the image[0].
>>
>> The problem I see is for the target or destination memory once you execute the
>> image. Once machine_kexec() runs, it tries to write to this, assuming it is
>> still present...

> Let's recap
> 
> 1. You load the image. You allocate memory for e.g., the kexec kernel.
> The pages will be marked PG_reserved, so they cannot be offlined.
> 
> 2. You do the kexec. The kexec kernel will only operate on a reserved
> memory region (reserved via e.g., kernel cmdline crashkernel=128M).

I think you are merging the kexec and kdump behaviours.
(Wrong terminology? The things behind 'kexec -l Image' and 'kexec -p Image')

For kdump, yes, the new kernel is loaded into the crashkernel reservation, and
confined to it.


For regular kexec, the new kernel can be loaded anywhere in memory. There might
be a difference with how this works on arm64....

The regular kexec kernel isn't stored in its final location when it's loaded; it's
relocated there when the image is executed. The target/destination memory may
have been removed in the meantime.

(an example recipe below should clarify this)


> Is it that in 2., the reserved memory region (for the crashkernel) could
> have been offlined in the meantime?

No, for kdump: the crashkernel reservation is PG_reserved, and it's not something
mm knows how to move, so that region can't be taken offline.

(On arm64 we additionally prevent the boot-memory from being removed as it is
all described as present by UEFI. The crashkernel reservation would always be
from this type of memory)


This is about a regular kexec, any crashdump reservation is irrelevant.
This kexec kernel is temporarily stored out of line, then relocated when executed.

A recipe so that we're at least on the same terminal! This is on a TX2 running
arm64's for-next/core using Qemu-TCG to emulate x86. (Sorry for the bizarre
config, it's because Qemu supports hotremove on x86, but not yet on arm64).


Insert the memory:
(qemu) object_add memory-backend-ram,id=mem1,size=1G
(qemu) device_add pc-dimm,id=dimm1,memdev=mem1

| root@vm:~# free -m
|               total        used        free      shared  ...
| Mem:            918          52         814           0  ...
| Swap:             0           0           0


Bring it online:
| root@vm:~# cd /sys/devices/system/memory/
| root@vm:/sys/devices/system/memory# for F in memory3*; do echo \
| online_movable > $F/state; done

| Built 1 zonelists, mobility grouping on.  Total pages: 251049
| Policy zone: DMA32

| -bash: echo: write error: Invalid argument
| root@vm:/sys/devices/system/memory# free -m
|               total        used        free      shared  ...
| Mem:           1942          53        1836           0  ...
| Swap:             0           0           0


Load kexec:
| root@vm:/sys/devices/system/memory# kexec -l /root/bzImage --reuse-cmdline

Press the Attention button to request removal:

(qemu) device_del dimm1

| Offlined Pages 32768
| Offlined Pages 32768
| Offlined Pages 32768
| Offlined Pages 32768
| Offlined Pages 32768
| Offlined Pages 32768
| Offlined Pages 32768
| Offlined Pages 32768
| Built 1 zonelists, mobility grouping on.  Total pages: 233728
| Policy zone: DMA32

The memory is gone:
| root@vm:/sys/devices/system/memory# free -m
|               total        used        free      shared  ...
| Mem:            918          89         769           0  ...
| Swap:             0           0           0

Trigger kexec:
| root@vm:/sys/devices/system/memory# kexec -e

[...]

| sd 0:0:0:0: [sda] Synchronizing SCSI cache
| kexec_core: Starting new kernel

... and Qemu restarts the platform firmware instead of proceeding with kexec.
(I assume this is a triple fault)

You can use mem-min and mem-max to control where kexec's user space will place
the memory.


If you apply this patch, the above sequence will fail at the device remove step,
as the physical addresses match the loaded kexec image:

| Offlined Pages 32768
| Offlined Pages 32768
| Offlined Pages 32768
| Offlined Pages 32768
| Offlined Pages 32768
| Offlined Pages 32768
| Offlined Pages 32768
| kexec_core: Memory region in use
| kexec_core: Memory region in use
| memory memory39: Offline failed.
| Built 1 zonelists, mobility grouping on.  Total pages: 299212
| Policy zone: Normal

| root@vm:/sys/devices/system/memory# free -m
|               total        used        free      shared  ...
| Mem:           1942          90        1793           0    ...
| Swap:             0           0           0

I can't remove the DIMM, because we failed to offline it:

(qemu) object_del mem1
object 'mem1' is in use, can not be deleted

and I can trigger kexec and boot the new kernel.

kexec user-space here comes from debian bullseye. It picked the removable memory
all by itself without any additional arguments.


(a different issue that can be ignored for now: x86 additionally fails to reboot
if I remove memory, even if it's not in use by the kexec image. This doesn't
cause qemu to reboot via firmware, I think it dies before the console. It
doesn't happen on arm64. I suspect the memory map is snapshotted and assumed to
still be correct when the image is executed.)


Thanks,

James
David Hildenbrand March 27, 2020, 6:52 p.m. UTC | #9
>> 2. You do the kexec. The kexec kernel will only operate on a reserved
>> memory region (reserved via e.g., kernel cmdline crashkernel=128M).
> 
> I think you are merging the kexec and kdump behaviours.
> (Wrong terminology? The things behind 'kexec -l Image' and 'kexec -p Image')

Oh, I see - I think your example below clarifies things. Something like
that should go in the cover letter if we end up with this patch being
required :)

(I missed that the problematic part is "random" addresses passed by user
space to the kernel, where it wants data to be loaded to on kexec -e)

> 
> For kdump, yes, the new kernel is loaded into the crashkernel reservation, and
> confined to it.
> 
> 
> For regular kexec, the new kernel can be loaded any where in memory. There might
> be a difference with how this works on arm64....
> 
> The regular kexec kernel isn't stored in its final location when its loaded, its
> relocated there when the image is executed. The target/destination memory may
> have been removed in the meantime.
> 
> (an example recipe below should clarify this)
> 
> 
>> Is it that in 2., the reserved memory region (for the crashkernel) could
>> have been offlined in the meantime?
> 
> No, for kdump: the crashkernel reservation is PG_reserved, and its not something
> mm knows how to move, so that region can't be taken offline.
> 
> (On arm64 we additionally prevent the boot-memory from being removed as it is
> all described as present by UEFI. The crashkernel reservation would always be
> from this type of memory)

Right.

> 
> 
> This is about a regular kexec, any crashdump reservation is irrelevant.
> This kexec kernel is temporarily stored out of line, then relocated when executed.
> 
> A recipe so that we're at least on the same terminal! This is on a TX2 running
> arm64's for-next/core using Qemu-TCG to emulate x86. (Sorry for the bizarre
> config, its because Qemu supports hotremove on x86, but not yet on arm64).
> 
> 
> Insert the memory:
> (qemu) object_add memory-backend-ram,id=mem1,size=1G
> (qemu) device_add pc-dimm,id=dimm1,memdev=mem1
> 
> | root@vm:~# free -m
> |               total        used        free      shared  ...
> | Mem:            918          52         814           0  ...
> | Swap:             0           0           0
> 
> 
> Bring it online:
> | root@vm:~# cd /sys/devices/system/memory/
> | root@vm:/sys/devices/system/memory# for F in memory3*; do echo \
> | online_movable > $F/state; done
> 
> | Built 1 zonelists, mobility grouping on.  Total pages: 251049
> | Policy zone: DMA32
> 
> | -bash: echo: write error: Invalid argument
> | root@vm:/sys/devices/system/memory# free -m
> |               total        used        free      shared  ...
> | Mem:           1942          53        1836           0  ...
> | Swap:             0           0           0
> 
> 
> Load kexec:
> | root@vm:/sys/devices/system/memory# kexec -l /root/bzImage --reuse-cmdline
> 

I assume this will trigger

kexec_load -> do_kexec_load -> kimage_load_segment ->
kimage_load_normal_segment -> kimage_alloc_page -> kimage_alloc_pages

Which will just allocate a bunch of pages and mark them reserved.

Now, AFAIK, all allocations will be unmovable. So none of the kexec
segment allocations will actually end up on your DIMM (as it is onlined
online_movable).

So, the loaded image (with its segments) from user won't be problematic
and not get placed on your DIMM.


Now, the problematic part is (via man kexec_load) "mem and memsz specify
a physical address range that is the target of the copy."

So the place where the image will be "assembled" at when doing the
reboot. Understood :)

> Press the Attention button to request removal:
> 
> (qemu) device_del dimm1
> 
> | Offlined Pages 32768
> | Offlined Pages 32768
> | Offlined Pages 32768
> | Offlined Pages 32768
> | Offlined Pages 32768
> | Offlined Pages 32768
> | Offlined Pages 32768
> | Offlined Pages 32768
> | Built 1 zonelists, mobility grouping on.  Total pages: 233728
> | Policy zone: DMA32
> 
> The memory is gone:
> | root@vm:/sys/devices/system/memory# free -m
> |               total        used        free      shared  ...
> | Mem:            918          89         769           0  ...
> | Swap:             0           0           0
> 
> Trigger kexec:
> | root@vm:/sys/devices/system/memory# kexec -e
> 
> [...]
> 
> | sd 0:0:0:0: [sda] Synchronizing SCSI cache
> | kexec_core: Starting new kernel
> 
> ... and Qemu restarts the platform firmware instead of proceeding with kexec.
> (I assume this is a triple fault)
> 
> You can use mem-min and mem-max to control where kexec's user space will place
> the memory.
> 
> 
> If you apply this patch, the above sequence will fail at the device remove step,
> as the physical addresses match the loaded kexec image:
> 
> | Offlined Pages 32768
> | Offlined Pages 32768
> | Offlined Pages 32768
> | Offlined Pages 32768
> | Offlined Pages 32768
> | Offlined Pages 32768
> | Offlined Pages 32768
> | kexec_core: Memory region in use
> | kexec_core: Memory region in use

Okay, so I assume the kexec userspace tool provided target kernel
addresses for segments that reside on the DIMM.


> | memory memory39: Offline failed.
> | Built 1 zonelists, mobility grouping on.  Total pages: 299212
> | Policy zone: Normal
> 
> | root@vm:/sys/devices/system/memory# free -m
> |               total        used        free      shared  ...
> | Mem:           1942          90        1793           0    ...
> | Swap:             0           0           0
> 
> I can't remove the DIMM, because we failed to offline it:

I wonder if we should instead make the "kexec -e" fail. It tries to
touch random system memory.

Denying to offline MOVABLE memory should be avoided - and what kexec
does here sounds dangerous to me (allowing it to write random system
memory).

Roughly what I am thinking is this:

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index ba1d91e868ca..70c39a5307e5 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1135,6 +1135,10 @@ int kernel_kexec(void)
                error = -EINVAL;
                goto Unlock;
        }
+       if (!kexec_image_validate()) {
+               error = -EINVAL;
+               goto Unlock;
+       }

 #ifdef CONFIG_KEXEC_JUMP
        if (kexec_image->preserve_context) {


kexec_image_validate() would go over all segments and validate that the
involved pages are actual valid memory (pfn_to_online_page()).

All we have to do is protect from memory hotplug until we switch to the
new kernel.

Will probably need some thought. But it will actually also bail out when
user space passes wrong physical memory addresses, instead of
triple-faulting silently.
James Morse March 30, 2020, 1 p.m. UTC | #10
Hi David,

On 3/27/20 6:52 PM, David Hildenbrand wrote:
>>> 2. You do the kexec. The kexec kernel will only operate on a reserved
>>> memory region (reserved via e.g., kernel cmdline crashkernel=128M).
>>
>> I think you are merging the kexec and kdump behaviours.
>> (Wrong terminology? The things behind 'kexec -l Image' and 'kexec -p Image')
> 
> Oh, I see - I think your example below clarifies things. Something like
> that should go in the cover letter if we end up in this patch being
> required :)

Do you mean the commit message? I think it's far too long...

Adding a sentence about the way kexec load works may help, the first paragraph
would read:

| Kexec allows user-space to specify the address that the kexec image should be
| loaded to. Because this memory may be in use, an image loaded for kexec is not
| stored in place, instead its segments are scattered through memory, and are
| re-assembled when needed. In the meantime, the target memory may have been
| removed.

Do you think that's clearer?


> (I missed that the problematic part is "random" addresses passed by user
> space to the kernel, where it wants data to be loaded to on kexec -e)

[...]

>> Load kexec:
>> | root@vm:/sys/devices/system/memory# kexec -l /root/bzImage --reuse-cmdline
>>
> 
> I assume this will trigger
> 
> kexec_load -> do_kexec_load -> kimage_load_segment ->
> kimage_load_normal_segment -> kimage_alloc_page -> kimage_alloc_pages
> 
> Which will just allocate a bunch of pages and mark them reserved.
> 
> Now, AFAIKs, all allocations will be unmovable. So none of the kexec
> segment allocations will actually end up on your DIMM (as it is onlined
> online_movable).
> 
> So, the loaded image (with its segments) from user won't be problematic
> and not get placed on your DIMM.
> 
> 
> Now, the problematic part is (via man kexec_load) "mem and memsz specify
> a physical address range that is the target of the copy."
> 
> So the place where the image will be "assembled" at when doing the
> reboot. Understood :)

Yup.

[...]

> I wonder if we should instead make the "kexec -e" fail. It tries to
> touch random system memory.

Heh, isn't touching random system memory what kexec does?!

It's all described to user-space as 'System RAM'. Teaching it to probe
/sys/devices/memory/... would require a user-space change.


> Denying to offline MOVABLE memory should be avoided - and what kexec
> does here sounds dangerous to me (allowing it to write random system
> memory).

> Roughly what I am thinking is this:
> 
> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
> index ba1d91e868ca..70c39a5307e5 100644
> --- a/kernel/kexec_core.c
> +++ b/kernel/kexec_core.c
> @@ -1135,6 +1135,10 @@ int kernel_kexec(void)
>                 error = -EINVAL;
>                 goto Unlock;
>         }
> +       if (!kexec_image_validate()) {
> +               error = -EINVAL;
> +               goto Unlock;
> +       }
> 
>  #ifdef CONFIG_KEXEC_JUMP
>         if (kexec_image->preserve_context) {
> 
> 
> kexec_image_validate() would go over all segments and validate that the
> involved pages are actual valid memory (pfn_to_online_page()).
> 
> All we have to do is protect from memory hotplug until we switch to the
> new kernel.

(migrate_to_reboot_cpu() can sleep), I think you'd end up with something like
this patch, but only while kexec_in_progress. I don't think letting kexec fail
if the events occur in a different order is good for user-space.


> Will probably need some thought. But it will actually also bail out when
> user space passes wrong physical memory addresses, instead of
> triple-faulting silently.

With this change, the reboot(LINUX_REBOOT_CMD_KEXEC) call would fail. This
thing doesn't usually return, so we're likely to trigger error-handling that has
never run before.

(Last time I debugged one of these, it turned out kexec had taken the network
interfaces down, meaning the nfsroot was no longer accessible)

How can user-space know whether kexec is going to succeed, or fail like this?
Any loaded kexec kernel could secretly be in this broken state.

Can user-space know what caused this to become unreliable? (without reading the
kernel source)


Given kexec can be unloaded by user-space, I think it's better to prevent us
getting into the broken state, preferably giving the hint that kexec is using
that memory. The user can 'kexec -u', then retry removing the memory.

I think forbidding the memory-offline is simpler for user-space to deal with.


Thanks,

James
David Hildenbrand March 30, 2020, 1:13 p.m. UTC | #11
> Adding a sentence about the way kexec load works may help, the first paragraph
> would read:
> 
> | Kexec allows user-space to specify the address that the kexec image should be
> | loaded to. Because this memory may be in use, an image loaded for kexec is not
> | stored in place, instead its segments are scattered through memory, and are
> | re-assembled when needed. In the meantime, the target memory may have been
> | removed.
> 
> Do you think thats clearer?

Yes, very much. Maybe add, that the target is described by user space
during kexec_load() and that user space - right now - parses /proc/iomem
to find applicable system memory.

> [...]
> 
>>> Load kexec:
>>> | root@vm:/sys/devices/system/memory# kexec -l /root/bzImage --reuse-cmdline
>>>
>>
>> I assume this will trigger
>>
>> kexec_load -> do_kexec_load -> kimage_load_segment ->
>> kimage_load_normal_segment -> kimage_alloc_page -> kimage_alloc_pages
>>
>> Which will just allocate a bunch of pages and mark them reserved.
>>
>> Now, AFAIKs, all allocations will be unmovable. So none of the kexec
>> segment allocations will actually end up on your DIMM (as it is onlined
>> online_movable).
>>
>> So, the loaded image (with its segments) from user won't be problematic
>> and not get placed on your DIMM.
>>
>>
>> Now, the problematic part is (via man kexec_load) "mem and memsz specify
>> a physical address range that is the target of the copy."
>>
>> So the place where the image will be "assembled" at when doing the
>> reboot. Understood :)
> 
> Yup.
> 
> [...]
> 
>> I wonder if we should instead make the "kexec -e" fail. It tries to
>> touch random system memory.
> 
> Heh, isn't touching random system memory what kexec does?!

Having a racy user interface that can trigger kernel crashes feels very
wrong. We should limit the impact.

> 
> Its all described to user-space as 'System RAM'. Teaching it to probe
> /sys/devices/memory/... would require a user-space change.

I think we should really rename hotplugged memory on all architectures.

Especially also relevant for virtio-mem/hyper-v balloon, where some
pieces of (hotplugged) memory blocks are partially unavailable and
should not be touched - accessing them results in unpredictable behavior
(e.g., crashes or discarded writes).

[...]

>> Will probably need some thought. But it will actually also bail out when
>> user space passes wrong physical memory addresses, instead of
>> triple-faulting silently.
> 
> With this change, the reboot(LINUX_REBOOT_CMD_KEXEC), call would fail. This
> thing doesn't usually return, so we're likely to trigger error-handling that has
> never run before.
> 
> (Last time I debugged one of these, it turned out kexec had taken the network
> interfaces down, meaning the nfsroot was no longer accessible)
> 
> How can user-space know whether kexec is going to succeed, or fail like this?
> Any loaded kexec kernel could secretly be in this broken state.
> 
> Can user-space know what caused this to become unreliable? (without reading the
> kernel source)
> 
> 
> Given kexec can be unloaded by user-space, I think its better to prevent us
> getting into the broken state, preferably giving the hint that kexec us using
> that memory. The user can 'kexec -u', then retry removing the memory.
> 
> I think forbidding the memory-offline is simpler for user-space to deal with.

I thought about this over the weekend, and I don't think it's the right
approach.

1. It's racy. If memory is getting offlined/unplugged just while user
space is about to trigger the kexec_load(), you end up with the very
same triple-fault.

2. It's semantically wrong. kexec does not need online memory ("managed
by the buddy"), but still you disallow offlining memory.


I would really much rather want to see user-space choosing boot memory
(e.g., renaming hotplugged memory on all architectures), and checking
during "kexec -e" if the selected memory is actually "there", before
trying to write to it.
James Morse March 30, 2020, 5:17 p.m. UTC | #12
Hi David,

On 3/30/20 2:13 PM, David Hildenbrand wrote:
>> Adding a sentence about the way kexec load works may help, the first paragraph
>> would read:
>>
>> | Kexec allows user-space to specify the address that the kexec image should be
>> | loaded to. Because this memory may be in use, an image loaded for kexec is not
>> | stored in place, instead its segments are scattered through memory, and are
>> | re-assembled when needed. In the meantime, the target memory may have been
>> | removed.
>>
>> Do you think thats clearer?
> 
> Yes, very much. Maybe add, that the target is described by user space
> during kexec_load() and that user space - right now - parses /proc/iomem
> to find applicable system memory.

(I don't think x86 parses /proc/iomem anymore). I'll repost this patch with that
expanded commit message, once we've agreed this is the right thing to do!


>>> I wonder if we should instead make the "kexec -e" fail. It tries to
>>> touch random system memory.
>>
>> Heh, isn't touching random system memory what kexec does?!
> 
> Having a racy user interface that can trigger kernel crashes feels very
> wrong. We should limit the impact.


>> Its all described to user-space as 'System RAM'. Teaching it to probe
>> /sys/devices/memory/... would require a user-space change.
> 
> I think we should really rename hotplugged memory on all architectures.
> 
> Especially also relevant for virtio-mem/hyper-v balloon, where some
> pieces of (hotplugged )memory blocks are partially unavailable and
> should not be touched - accessing them results in unpredictable behavior
> (e.g., crashes or discarded writes).

I'll need to look into these. I'd assume for KVM that virtio-mem can be brought
back when it's accessed ... it's just going to be slow.


>>> Will probably need some thought. But it will actually also bail out when
>>> user space passes wrong physical memory addresses, instead of
>>> triple-faulting silently.
>>
>> With this change, the reboot(LINUX_REBOOT_CMD_KEXEC), call would fail. This
>> thing doesn't usually return, so we're likely to trigger error-handling that has
>> never run before.
>>
>> (Last time I debugged one of these, it turned out kexec had taken the network
>> interfaces down, meaning the nfsroot was no longer accessible)
>>
>> How can user-space know whether kexec is going to succeed, or fail like this?
>> Any loaded kexec kernel could secretly be in this broken state.
>>
>> Can user-space know what caused this to become unreliable? (without reading the
>> kernel source)
>>
>>
>> Given kexec can be unloaded by user-space, I think its better to prevent us
>> getting into the broken state, preferably giving the hint that kexec us using
>> that memory. The user can 'kexec -u', then retry removing the memory.
>>
>> I think forbidding the memory-offline is simpler for user-space to deal with.
> 
> I thought about this over the weekend, and I don't think it's the right
> approach.


> 1. It's racy. If memory is getting offlined/unplugged just while user
> space is about to trigger the kexec_load(), you end up with the very
> same triple-fault.

load? How is this different to user-space providing a bogus address?

Sure, user-space may take a nap between parsing /proc/iomem and calling
kexec_load(), but the kernel should reject these as they would never work.

(I can't see where sanity_check_segment_list() considers the platform's memory.
If it doesn't, we should fix it)

Once the image is loaded, and clashes with a request to remove the memory there
are two choices: secretly unload the image, or prevent the memory being taken
offline.


> 2. It's semantically wrong. kexec does not need online memory ("managed
> by the buddy"), but still you disallow offlining memory.

It does need the memory if you want 'kexec -e' to succeed.
If there were any sanity tests, they should have happened at load time.

The memory is effectively in use by the loaded kexec image. User-space told the
kernel to use this memory, you should not be able to then remove it, without
unloading the kexec image first.


Are you saying feeding bogus addresses to kexec_load() is _expected_ to blow up
like this?


> I would really much rather want to see user-space choosing boot memory
> (e.g., renaming hotplugged memory on all architectures), and checking
> during "kexec -e" if the selected memory is actually "there", before
> trying to write to it.

How does 'kexec -e' know where the kexec kernel was loaded? You'd need to pass
something between 'load' and 'exec'. How do you keep existing user-space working
as much as possible?

What do you do if the memory isn't there? User-space just called reboot(), it
would be better to avoid getting into the situation where we have to fail that call.


Solving the bigger problem, would add a 'kexec_it_now' flag to the kexec_load()
call. This would make the window where 'stuff' can change much smaller. Things
changing while user-space sleeps isn't a solvable problem, these would need to
be rejected by sanity tests at load time.


Thanks,

James
David Hildenbrand March 30, 2020, 6:14 p.m. UTC | #13
On 30.03.20 19:17, James Morse wrote:
> Hi David,
> 
> On 3/30/20 2:13 PM, David Hildenbrand wrote:
>>> Adding a sentence about the way kexec load works may help, the first paragraph
>>> would read:
>>>
>>> | Kexec allows user-space to specify the address that the kexec image should be
>>> | loaded to. Because this memory may be in use, an image loaded for kexec is not
>>> | stored in place, instead its segments are scattered through memory, and are
>>> | re-assembled when needed. In the meantime, the target memory may have been
>>> | removed.
>>>
>>> Do you think thats clearer?
>>
>> Yes, very much. Maybe add, that the target is described by user space
>> during kexec_load() and that user space - right now - parses /proc/iomem
>> to find applicable system memory.
> 
> (I don't think x86 parses /proc/iomem anymore). I'll repost this patch with that
> expanded commit message, once we've agreed this is the right thing to do!

Right, I can see kexec-tools parsing /sys/firmware/memmap first.
Unfortunately, all hotplugged memory (via add_memory()) is indicated
there as System RAM ... including memory added by virtio-mem.

I think we should adapt the type there as well. (in your patch #2)

	firmware_map_add_hotplug(start, start + size, "System RAM");

> 
> 
>>>> I wonder if we should instead make the "kexec -e" fail. It tries to
>>>> touch random system memory.
>>>
>>> Heh, isn't touching random system memory what kexec does?!
>>
>> Having a racy user interface that can trigger kernel crashes feels very
>> wrong. We should limit the impact.
> 
> 
>>> Its all described to user-space as 'System RAM'. Teaching it to probe
>>> /sys/devices/memory/... would require a user-space change.
>>
>> I think we should really rename hotplugged memory on all architectures.
>>
>> Especially also relevant for virtio-mem/hyper-v balloon, where some
>> pieces of (hotplugged )memory blocks are partially unavailable and
>> should not be touched - accessing them results in unpredictable behavior
>> (e.g., crashes or discarded writes).
> 
> I'll need to look into these. I'd assume for KVM that virtio-mem can be brought
> back when its accessed ... its just going to be slow.

Touching unplugged virtio-mem memory can result in unpredictable
behavior. Touching (some) unplugged Hyper-V memory will be handled
similarly AFAIK.

[...]

>> 1. It's racy. If memory is getting offlined/unplugged just while user
>> space is about to trigger the kexec_load(), you end up with the very
>> same triple-fault.
> 
> load? How is this different to user-space providing a bogus address?

I guess it's not different. It's just racy because user space with good
intent could crash the system :)

> 
> Sure, user-space may take a nap between parsing /proc/iomem and calling
> kexec_load(), but the kernel should reject these as they would never work.
> 
> (I can't see where sanity_check_segment_list() considers the platform's memory.
> If it doesn't, we should fix it)

Right, that's what I meant. I was not able to find any sanity checks.
Maybe they are in place but I was not able to spot them.

> 
> Once the image is loaded, and clashes with a request to remove the memory there
> are two choices: secretly unload the image, or prevent the memory being taken
> offline.

Exactly. Or make "kexec -e" fail.

> 
> 
>> 2. It's semantically wrong. kexec does not need online memory ("managed
>> by the buddy"), but still you disallow offlining memory.
> 
> It does need the memory if you want 'kexec -e' to succeed.
> If there were any sanity tests, they should have happened at load time.

Offlining != removing. That's the point I was trying to make. (and we
don't want to block removing of memory in the kernel any other way)

> 
> The memory is effectively in use by the loaded kexec image. User-space told the
> kernel to use this memory, you should not be able to then remove it, without
> unloading the kexec image first.

It's not in use before you do the "kexec -e" IMHO.

> Are you saying feeding bogus addresses to kexec_load() is _expected_ to blow up
> like this?

No, not at all. I think this should be fixed if this is possible.

> 
>> I would really much rather want to see user-space choosing boot memory
>> (e.g., renaming hotplugged memory on all architectures), and checking
>> during "kexec -e" if the selected memory is actually "there", before
>> trying to write to it.
> 
> How does 'kexec -e' know where the kexec kernel was loaded? You'd need to pass
> something between 'load' and 'exec'. How do you keep existing user-space working
> as much as possible?

If we use new types (e.g., "System RAM (hotplugged)"), looks like most
of kexec will continue working (memory will be treated like
RANGE_RESERVED or ignored).

I guess we would still have to teach kexec-tools the new types,
primarily so that the crash memory ranges keep getting detected
properly. (no idea how they are used, will have to take a closer look)

> 
> What do you do if the memory isn't there? User-space just called reboot(), it
> would be better to avoid getting into the situation where we have to fail that call.

In kernel_kexec() we already fail if there is no kernel image loaded, so
we can similarly simply fail if the kernel image cannot be moved to the
target memory IMHO.
Andrew Morton April 10, 2020, 7:10 p.m. UTC | #14
It's unclear (to me) what is the status of this patchset.  But it does appear that
a new version can be expected?
Baoquan He April 11, 2020, 3:44 a.m. UTC | #15
On 04/10/20 at 12:10pm, Andrew Morton wrote:
> It's unclear (to me) what is the status of this patchset.  But it does appear that
> an new version can be expected?

As we discussed in the thread of replying to the cover letter, the
idea of this patchset is not good. 

Because we tend to use kexec_file_load more and improve/enhance it in the
future, and gradually obsolete the old kexec_load interface which this
patchset is trying to fix.

And the issue James spotted is very much a corner case; we have suggested
another easier way to avoid it by adding a systemd service to load kexec
and monitor memory adding/removing uevents, just as we have done for
kdump loading. Bhupesh is working on this to add a service in Fedora
and test, and will put it to RHEL too if nobody is unsatisfied.

Thanks
Baoquan
Russell King (Oracle) April 11, 2020, 9:30 a.m. UTC | #16
On Sat, Apr 11, 2020 at 11:44:14AM +0800, Baoquan He wrote:
> Because We tend to use kexec_file_load more and improve/enhance it in the
> future, and gradually obsolete the old kexec_load interface which this
> patchset is trying to fix on. 

That's not going to happen; 32-bit ARM kexec uses the kexec_load
interface rather than the kexec_file_load version, and I see no one
with any interest in changing that - and there's users of the former.

I don't see how it's possible to convert 32-bit ARM kexec to the
kexec_file_load interface - this assumes that all you have are the
kernel, initrd, and commandline, but on 32-bit ARM kexec, we have
kernel, initrd and the dtb blob which the user can specify.

So, if we wanted to obsolete the kexec_load interface, _first_ there
needs to be a way to provide users with the existing functionality
they have already in place on 32-bit ARM - otherwise we're looking
at a userspace regression.  Especially as kexec_file_load takes
precedence on some distro patched versions of the kexec tool,
irrespective of which interface the user requests of the tool.
David Hildenbrand April 11, 2020, 9:58 a.m. UTC | #17
> Am 11.04.2020 um 11:40 schrieb Russell King - ARM Linux admin <linux@armlinux.org.uk>:
> 
> On Sat, Apr 11, 2020 at 11:44:14AM +0800, Baoquan He wrote:
>> Because We tend to use kexec_file_load more and improve/enhance it in the
>> future, and gradually obsolete the old kexec_load interface which this
>> patchset is trying to fix on. 
> 
> That's not going to happen; 32-bit ARM kexec uses the kexec_load
> interface rather than the kexec_file_load version, and I see no one
> with any interest in changing that - and there's users of the former.
> 
> I don't see how it's possible to convert 32-bit ARM kexec to the
> kexec_file_load interface - this assumes that all you have are the
> kernel, initrd, and commandline, but on 32-bit ARM kexec, we have
> kernel, initrd and the dtb blob which the user can specify.
> 
> So, if we wanted to obsolete the kexec_load interface, _first_ there
> needs to be a way to provide users with the existing functionality
> they have already in place on 32-bit ARM - otherwise we're looking
> at a userspace regression.  Especially as kexec_file_load takes
> precedence on some distro patched versions of the kexec tool,
> irrespective of which interface the user requests of the tool.
> 

On 32bit architectures we usually don't really care about memory hotplug. So we could deprecate it only for 64bit architectures AFAIKS.


> -- 
> RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
> FTTC broadband for 0.8mile line in suburbia: sync at 10.2Mbps down 587kbps up
>
Baoquan He April 12, 2020, 5:35 a.m. UTC | #18
On 04/11/20 at 10:30am, Russell King - ARM Linux admin wrote:
> On Sat, Apr 11, 2020 at 11:44:14AM +0800, Baoquan He wrote:
> > Because We tend to use kexec_file_load more and improve/enhance it in the
> > future, and gradually obsolete the old kexec_load interface which this
> > patchset is trying to fix on. 
> 
> That's not going to happen; 32-bit ARM kexec uses the kexec_load
> interface rather than the kexec_file_load version, and I see no one
> with any interest in changing that - and there's users of the former.
> 
> I don't see how it's possible to convert 32-bit ARM kexec to the
> kexec_file_load interface - this assumes that all you have are the
> kernel, initrd, and commandline, but on 32-bit ARM kexec, we have
> kernel, initrd and the dtb blob which the user can specify.

Well, I understand what you said about 32-bit ARM only supporting
kexec_load. That's why I said we tend to obsolete it
'GRADUALLY'. It's the existing users who are using kexec_load, and the
ARCHes which only have kexec_load, that make us have to transfer to
kexec_file_load gradually.

Comparing with kexec_load, kexec_file_load has only one disadvantage,
that is some ARCHes only have kexec_load. Otherwise, kexec_file_load
benefits kexec/kdump developing/maintaining very much. The loading job
of kexec_file_load is mostly done in kernel, we can get whatever we
want about kernel information very conveniently to do anything needed.
For the kexec_load interface, the loading job is mostly done in
userspace, we have to export kernel information to procfs, sysfs, etc,
then parse them in kexec_tools, finally passed it to kernel part of
kexec loading.

The gradual obsoleting means we may only add
feature/improvement/enhancement to kexec_file_load. And if a bug fix is
needed for both kexec_load and kexec_file_load, and the fix is very
complicated, we may only fix it in kexec_file_load too. Adding the
kexec_file_load interface is suggested for ARCHes which don't have it; just
port the user space part to the kernel as x86/s390/arm64 have done.

Surely, it doesn't mean we don't fix the critical/blocker bug with
kexec_load loading. We still try to do, just are not so eager. In the
existing product environment, the kexec_load is used, just keep using
it. Do we bother to change it to kexec_file_load, e.g in our RHEL7
distros? Certainly not. But in our new product, we will change to use
kexec_file_load interface. I guess this is similar with arm64. The
advantage and benefit have been told in the 2nd paragraph.


As for 32-bit ARM, is it like the old product, we have many in-use systems
deployed in customers' laboratory? Wondering if ARM continues designing
new 32-bit ARM cpu, and some companies continue producing tons of 32-bit ARM
cpus. If yes, I think we need continue taking care of kexec_load if
32-bit ARM can't convert to kexec_file_load. If not, it may be not a
barrier when we consider converting kexec_load to kexec_file_load in
other ARCHes. We just need keep using it, try to fix those critical/blocker
bug in kexec_load interface if encountered.

Finally, coming back to this patchset itself, the issue James spotted
is not so critical, I would say. When I do kexec jumping, I will do
loading firstly, then trigger jumping. I can think of the case that
people may load a kexec-ed kernel, then do something else, and later she/he
triggers the kexec jumping. These are not necessary steps. As Dave and I
replied to James in the cover-letter thread, we can add a systemd service for
kexec loading which monitors hotplug uevents and reloads if any hot remove
happened. This is quite easy to do; I don't see any problem with it, or
why we shouldn't do it this way.

My personal opinion, please tell if I miss anything.

> 
> So, if we wanted to obsolete the kexec_load interface, _first_ there
> needs to be a way to provide users with the existing functionality
> they have already in place on 32-bit ARM - otherwise we're looking
> at a userspace regression.  Especially as kexec_file_load takes
> precedence on some distro patched versions of the kexec tool,
> irrespective of which interface the user requests of the tool.
> 
> -- 
> RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
> FTTC broadband for 0.8mile line in suburbia: sync at 10.2Mbps down 587kbps up
>
Russell King (Oracle) April 12, 2020, 8:08 a.m. UTC | #19
On Sun, Apr 12, 2020 at 01:35:07PM +0800, Baoquan He wrote:
> On 04/11/20 at 10:30am, Russell King - ARM Linux admin wrote:
> > On Sat, Apr 11, 2020 at 11:44:14AM +0800, Baoquan He wrote:
> > > Because We tend to use kexec_file_load more and improve/enhance it in the
> > > future, and gradually obsolete the old kexec_load interface which this
> > > patchset is trying to fix on. 
> > 
> > That's not going to happen; 32-bit ARM kexec uses the kexec_load
> > interface rather than the kexec_file_load version, and I see no one
> > with any interest in changing that - and there's users of the former.
> > 
> > I don't see how it's possible to convert 32-bit ARM kexec to the
> > kexec_file_load interface - this assumes that all you have are the
> > kernel, initrd, and commandline, but on 32-bit ARM kexec, we have
> > kernel, initrd and the dtb blob which the user can specify.
> 
> Well, I understand what you said about 32-bit ARM support with only
> kexec_old support thing. That's why I said we tend to obsolete it
> 'GRADUALLY'. It's the existing users who are using kexec_load, and the
> ARCHes which only has kexec_load, make us have to transfer to
> kexec_file_load gradually.
> 
> Comparing with kexec_load, kexec_file_load has only one disadvantage,
> that is some ARCHes only have kexec_load. Otherwise, kexec_file_load
> benefits kexec/kdump developping/maintaining very much. The loading job
> of kexec_file_load is mostly done in kernel, we can get whatever we
> want about kernel information very conveniently to do anything needed.
> For the kexec_load interface, the loading job is mostly done in
> userspace, we have to export kernel information to procfs, sysfs, etc,
> then parse them in kexec_tools, finally passed it to kernel part of
> kexec loading.
> 
> The gradual obsoleting means we may only add
> feature/improvement/enhancement to kexec_file_load. And if a bug fix is
> needed for both kexec_load and kexec_file_load, and the fix is very
> complicated, we may only fix it in kexec_file_load too. Kexec_file_load
> interface is suggested to add if does't have, just port user space part
> to kernel as x86/s390/arm64 have done.
> 
> Surely, it doesn't mean we don't fix the critical/blocker bug with
> kexec_load loading. We still try to do, just are not so eager. In the
> existing product environment, the kexec_load is used, just keep using
> it. Do we bother to change it to kexec_file_load, e.g in our RHEL7
> distros? Certainly not. But in our new product, we will change to use
> kexec_file_load interface. I guess this is similar with arm64. The
> advantage and benefit have been told in the 2nd paragraph.
> 
> 
> As for 32-bit ARM, is it like the old product, we have many in-use systems
> deployed in customers' laboratory? Wondering if ARM continues designing
> new 32-bit ARM cpu, and some companies continue producing tons of 32-bit ARM
> cpus. If yes, I think we need continue taking care of kexec_load if
> 32-bit ARM can't convert to kexec_file_load. If not, it may be not a
> barrier when we consider converting kexec_load to kexec_file_load in
> other ARCHes. We just need keep using it, try to fix those critical/blocker
> bug in kexec_load interface if encountered.
> 
> Finally, comning back to this patchset itself, the issue James spotted
> is not so ciritical, I would say. When I do kexec jumping, I will do
> loading firstly, then trigge jumping. I can think of the case that
> people may load kexec-ed kernel, then do something else, later she/he
> triggers the kexec jumping. These are not necessary steps. As Dave and I
> replied to James in the cover-letter thread, adding a systemd service of
> kexec loading, monitor hotplug uevent, reload it if any hot remove
> happened. This is quite easy to do, I don't see any problem with it, and
> why we don't do like this. 
> 
> My personal opinion, please tell if I miss anything.

All that opinion and hand waving about the benefits of the new
interface is totally irrelevant for 32-bit ARM for the reasons
I stated in my email to which you replied.

Gradual obsolescence or not, the file interface can't be supported
on 32-bit ARM as-is - it is totally inadequate and inferior as an
API compared to the functionality we have with plain kexec_load.
Without that point addressed, kexec_file_load is meaningless for
32-bit ARM.
Eric W. Biederman April 12, 2020, 7:52 p.m. UTC | #20
The only benefit of kexec_file_load is that it is simple enough from a
kernel perspective that signatures can be checked.

kexec_load in every other respect is the more capable and functional
interface.  It makes no sense to get rid of it.

It does make sense to reload with a loaded kernel on memory hotplug.
That is simple and easy.  If we are going to handle something in the
kernel it should simply be an automated unloading of the kernel on memory
hotplug.


I think it would be irresponsible to deprecate kexec_load on any
platform.

I also suspect that kexec_file_load could be taught to copy the dtb
on arm32 if someone wants to deal with signatures.

We definitely can not even think of deprecating kexec_load until
every architecture that supports it also supports kexec_file_load and everyone
is happy with that interface.  That is Linus's no regression rule.


Eric
Bhupesh Sharma April 12, 2020, 8:37 p.m. UTC | #21
On Mon, Apr 13, 2020 at 1:26 AM Eric W. Biederman <ebiederm@xmission.com> wrote:
>
>
> The only benefit of kexec_file_load is that it is simple enough from a
> kernel perspective that signatures can be checked.
>
> kexec_load in every other respect is the more capable and functional
> interface.  It makes no sense to get rid of it.
>
> It does make sense to reload with a loaded kernel on memory hotplug.
> That is simple and easy.  If we are going to handle something in the
> kernel it should simple an automated unloading of the kernel on memory
> hotplug.
>
>
> I think it would be irresponsible to deprecate kexec_load on any
> platform.
>
> I also suspect that kexec_file_load could be taught to copy the dtb
> on arm32 if someone wants to deal with signatures.
>
> We definitely can not even think of deprecating kexec_load until
> architecture that supports it also supports kexec_file_load and everyone
> is happy with that interface.  That is Linus's no regression rule.

TBH, I have seen several active users of kexec_load on arm32
environments and we have been trying to help them with kexec issues on
arm32 in recent past as well.

So, I agree with Eric's view that probably deprecating this in favour
of kexec_file_load will break these existing environments.

I tried to do some work at the start of this year to add
kexec_file_load support for arm32 in my spare cycles, but I gave up as
the arm32 hardware had a broken firmware and couldn't boot latest
upstream kernel.

Maybe I'll try to find some spare cycles in the coming days to do it.

But I think since kexec_load is an important interface on these arm32
boards for supporting existing kexec-based bootloaders, we should
continue supporting the same until kexec_file_load is supported/mature
enough for arm32.

Thanks,
Bhupesh
Baoquan He April 13, 2020, 2:37 a.m. UTC | #22
On 04/12/20 at 02:52pm, Eric W. Biederman wrote:
> 
> The only benefit of kexec_file_load is that it is simple enough from a
> kernel perspective that signatures can be checked.

We don't have this restriction any more with below commit:

commit 99d5cadfde2b ("kexec_file: split KEXEC_VERIFY_SIG into KEXEC_SIG
and KEXEC_SIG_FORCE")

With KEXEC_SIG_FORCE not set, we can use kexec_file_load to cover both
secure boot and legacy systems for kexec/kdump. Being simple enough is
enough to attract and convince us to use it instead. And kexec_file_load
has been in use for several years on systems with secure boot, since
added in 2014, on x86_64.

> 
> kexec_load in every other respect is the more capable and functional
> interface.  It makes no sense to get rid of it.
> 
> It does make sense to reload with a loaded kernel on memory hotplug.
> That is simple and easy.  If we are going to handle something in the
> kernel it should simple an automated unloading of the kernel on memory
> hotplug.
> 
> 
> I think it would be irresponsible to deprecate kexec_load on any
> platform.
> 
> I also suspect that kexec_file_load could be taught to copy the dtb
> on arm32 if someone wants to deal with signatures.
> 
> We definitely can not even think of deprecating kexec_load until
> architecture that supports it also supports kexec_file_load and everyone
> is happy with that interface.  That is Linus's no regression rule.

I should pick a milder word than 'obsolete' to express our tendency and
tell our plan. Even though I added 'gradually', it seems it doesn't help
much. I didn't mean to say 'deprecate' at all when I replied.

The situation and trend I understand about kexec_load and kexec_file_load
are:

1) Supporting kexec_file_load is suggested to add in ARCHes which don't
have yet, just as x86_64, arm64 and s390 have done;
 
2) kexec_file_load is suggested to use, and take precedence over
kexec_load in the future, if both are supported in one ARCH.

3) Kexec_load is kept being used by ARCHes w/o kexec_file_load support,
and by ARCHes for back compatibility w/ kexec_file_load support.

For 1) and 2), I think the reason is obvious as Eric said,
kexec_file_load is simple enough. And currently, whenever we got a bug
report, we may need fix them twice, for kexec_load and kexec_file_load.
If kexec_file_load is made by default, e.g on x86_64, we will change it
in kernel space only, for kexec_file_load. This is what I meant about
'obsolete gradually'. I think for arm64, s390, they will do these too.
Unless there's some critical/blocker bug in kexec_load, to corrupt the
old kexec_load interface in old product.

For 3), people can still use kexec_load and develop/fix for it, if no
kexec_file_load supported. But 32-bit arm should be a different one,
more like i386, we will leave it as is, and fix anything which could
break it. But people really expects to improve or add feature to it? E.g
in this patchset, the mem hotplug issue James raised, I assume James is
focusing on arm64, x86_64, but not 32-bit arm. As DavidH commented in
another reply, people even don't agree to continue supporting memory
hotplug on 32-bit system. We ever took effort to fix a memory hotplug
bug on i386 with a patch, but people would rather set it as BROKEN.
Eric W. Biederman April 13, 2020, 1:15 p.m. UTC | #23
Baoquan He <bhe@redhat.com> writes:

> On 04/12/20 at 02:52pm, Eric W. Biederman wrote:
>> 
>> The only benefit of kexec_file_load is that it is simple enough from a
>> kernel perspective that signatures can be checked.
>
> We don't have this restriction any more with below commit:
>
> commit 99d5cadfde2b ("kexec_file: split KEXEC_VERIFY_SIG into KEXEC_SIG
> and KEXEC_SIG_FORCE")
>
> With KEXEC_SIG_FORCE not set, we can use kexec_load_file to cover both
> secure boot or legacy system for kexec/kdump. Being simple enough is
> enough to astract and convince us to use it instead. And kexec_file_load
> has been in use for several years on systems with secure boot, since
> added in 2014, on x86_64.

No.  Actually kexec_file_load is the less capable interface, and less
flexible interface.  Which is why it is appropriate for signature
verification.

>> kexec_load in every other respect is the more capable and functional
>> interface.  It makes no sense to get rid of it.
>> 
>> It does make sense to reload with a loaded kernel on memory hotplug.
>> That is simple and easy.  If we are going to handle something in the
>> kernel it should simple an automated unloading of the kernel on memory
>> hotplug.
>> 
>> 
>> I think it would be irresponsible to deprecate kexec_load on any
>> platform.
>> 
>> I also suspect that kexec_file_load could be taught to copy the dtb
>> on arm32 if someone wants to deal with signatures.
>> 
>> We definitely can not even think of deprecating kexec_load until
>> architecture that supports it also supports kexec_file_load and everyone
>> is happy with that interface.  That is Linus's no regression rule.
>
> I should pick a milder word to express our tendency and tell our plan
> then 'obsolete'. Even though I added 'gradually', seems it doesn't help
> much. I didn't mean to say 'deprecate' at all when replied.
>
> The situation and trend I understand about kexec_load and kexec_file_load
> are:
>
> 1) Supporting kexec_file_load is suggested to add in ARCHes which don't
> have yet, just as x86_64, arm64 and s390 have done;
>  
> 2) kexec_file_load is suggested to use, and take precedence over
> kexec_load in the future, if both are supported in one ARCH.

The deep problem is that kexec_file_load is distinctly less expressive
than kexec_load.

> 3) Kexec_load is kept being used by ARCHes w/o kexc_file_load support,
> and by ARCHes for back compatibility w/ kexec_file_load support.
>
> For 1) and 2), I think the reason is obvious as Eric said,
> kexec_file_load is simple enough. And currently, whenever we got a bug
> report, we may need fix them twice, for kexec_load and kexec_file_load.
> If kexec_file_load is made by default, e.g on x86_64, we will change it
> in kernel space only, for kexec_file_load. This is what I meant about
> 'obsolete gradually'. I think for arm64, s390, they will do these too.
> Unless there's some critical/blocker bug in kexec_load, to corrupt the
> old kexec_load interface in old product.

Maybe.  The code that kexec_file_load sucked into the kernel is quite
stable and rarely needs changes except during a port of kexec to
another architecture.

Last I looked the real maintenance effort of kexec and kexec on panic was
in the drivers.  So I don't think we can use maintenance to do anything.

> For 3), people can still use kexec_load and develop/fix for it, if no
> kexec_file_load supported. But 32-bit arm should be a different one,
> more like i386, we will leave it as is, and fix anything which could
> break it. But people really expects to improve or add feature to it? E.g
> in this patchset, the mem hotplug issue James raised, I assume James is
> focusing on arm64, x86_64, but not 32-bit arm. As DavidH commented in
> another reply, people even don't agree to continue supporting memory
> hotplug on 32-bit system. We ever took effort to fix a memory hotplug
> bug on i386 with a patch, but people would rather set it as BROKEN.

For memory hotplug just reload.  Userspace already gets good events.

We should not expect anything except a panic kernel to be loaded over a
memory hotplug event. The kexec on panic code should actually be loaded
in a location that we don't relinquish if asked for it.

Quite frankly at this point I would love to see the signature fad die,
which would allow us to remove kexec_file_load.  I still have not seen
the signature code used anywhere except by people anticipating trouble.

Given that Microsoft has already directly signed a malicious bootloader
(not in the Linux ecosystem), I don't even know if any of the reasons
for having kexec_file_load are legitimate.


If someone wants to do the work and ensure everything that is possible
to load with kexec_load is possible to load with kexec_file_load
(kernels supporting the multi-boot protocol, etc.), then we can consider
deprecating kexec_load.


I think it took me about 15 years to remove the sysctl system call and
it only ever had about 10 users.  If you want to go through that kind of
work to make certain there are no more users and that everything they
could do with the old interface is doable with the new interface then
please be my guest.  Until then we need to fully support kexec_load.

Eric
Andrew Morton April 13, 2020, 11:01 p.m. UTC | #24
On Mon, 13 Apr 2020 08:15:23 -0500 ebiederm@xmission.com (Eric W. Biederman) wrote:

> > For 3), people can still use kexec_load and develop/fix for it, if no
> > kexec_file_load supported. But 32-bit arm should be a different one,
> > more like i386, we will leave it as is, and fix anything which could
> > break it. But people really expects to improve or add feature to it? E.g
> > in this patchset, the mem hotplug issue James raised, I assume James is
> > focusing on arm64, x86_64, but not 32-bit arm. As DavidH commented in
> > another reply, people even don't agree to continue supporting memory
> > hotplug on 32-bit system. We ever took effort to fix a memory hotplug
> > bug on i386 with a patch, but people would rather set it as BROKEN.
> 
> For memory hotplug just reload.  Userspace already gets good events.
> 
> We should not expect anything except a panic kernel to be loaded over a
> memory hotplug event. The kexec on panic code should actually be loaded
> in a location that we don't reliquish if asked for it.

Is that a nack for James's patchset?
Eric W. Biederman April 14, 2020, 6:13 a.m. UTC | #25
Andrew Morton <akpm@linux-foundation.org> writes:

> On Mon, 13 Apr 2020 08:15:23 -0500 ebiederm@xmission.com (Eric W. Biederman) wrote:
>
>> > For 3), people can still use kexec_load and develop/fix for it, if no
>> > kexec_file_load supported. But 32-bit arm should be a different one,
>> > more like i386, we will leave it as is, and fix anything which could
>> > break it. But people really expects to improve or add feature to it? E.g
>> > in this patchset, the mem hotplug issue James raised, I assume James is
>> > focusing on arm64, x86_64, but not 32-bit arm. As DavidH commented in
>> > another reply, people even don't agree to continue supporting memory
>> > hotplug on 32-bit system. We ever took effort to fix a memory hotplug
>> > bug on i386 with a patch, but people would rather set it as BROKEN.
>> 
>> For memory hotplug just reload.  Userspace already gets good events.
>> 
>> We should not expect anything except a panic kernel to be loaded over a
>> memory hotplug event. The kexec on panic code should actually be loaded
>> in a location that we don't reliquish if asked for it.
>
> Is that a nack for James's patchset?

I have just read the end of the thread and I have the sense that the
patchset had already been rejected.  I will see if I can go back and
read the beginning.

I was mostly reacting to the idea that you could stop maintaining an
interface that people are actively using because there is a newer
interface.

Eric
Baoquan He April 14, 2020, 6:40 a.m. UTC | #26
On 04/13/20 at 08:15am, Eric W. Biederman wrote:
> Baoquan He <bhe@redhat.com> writes:
> 
> > On 04/12/20 at 02:52pm, Eric W. Biederman wrote:
> >> 
> >> The only benefit of kexec_file_load is that it is simple enough from a
> >> kernel perspective that signatures can be checked.
> >
> > We don't have this restriction any more with below commit:
> >
> > commit 99d5cadfde2b ("kexec_file: split KEXEC_VERIFY_SIG into KEXEC_SIG
> > and KEXEC_SIG_FORCE")
> >
> > With KEXEC_SIG_FORCE not set, we can use kexec_load_file to cover both
> > secure boot or legacy system for kexec/kdump. Being simple enough is
> > enough to astract and convince us to use it instead. And kexec_file_load
> > has been in use for several years on systems with secure boot, since
> > added in 2014, on x86_64.
> 
> No.  Actaully kexec_file_load is the less capable interface, and less
> flexible interface.  Which is why it is appropriate for signature
> verification.

Well, everyone has a stance and the corresponding view. You could have
a wider view from long time maintenance and in an upstream position, and think
kexec_file_load is horrible. But I can only see from our work as a front
line engineer to maintain/develop kexec/kdump in RHEL, and think
kexec_file_load is easier to maintain.

The exception, surely, is support for multiple kernel image formats. Whether it is
kexec_load or kexec_file_load, e.g. on x86_64, we only support bzImage.
This is produced by the kernel build by default. We have no way to
support it in our distros and add it into kexec_file_load.

[RFC PATCH] x86/boot: make ELF kernel multiboot-able
https://lkml.org/lkml/2017/2/15/654

> 
> >> kexec_load in every other respect is the more capable and functional
> >> interface.  It makes no sense to get rid of it.
> >> 
> >> It does make sense to reload with a loaded kernel on memory hotplug.
> >> That is simple and easy.  If we are going to handle something in the
> >> kernel it should simple an automated unloading of the kernel on memory
> >> hotplug.
> >> 
> >> 
> >> I think it would be irresponsible to deprecate kexec_load on any
> >> platform.
> >> 
> >> I also suspect that kexec_file_load could be taught to copy the dtb
> >> on arm32 if someone wants to deal with signatures.
> >> 
> >> We definitely can not even think of deprecating kexec_load until
> >> architecture that supports it also supports kexec_file_load and everyone
> >> is happy with that interface.  That is Linus's no regression rule.
> >
> > I should pick a milder word to express our tendency and tell our plan
> > then 'obsolete'. Even though I added 'gradually', seems it doesn't help
> > much. I didn't mean to say 'deprecate' at all when replied.
> >
> > The situation and trend I understand about kexec_load and kexec_file_load
> > are:
> >
> > 1) Supporting kexec_file_load is suggested to add in ARCHes which don't
> > have yet, just as x86_64, arm64 and s390 have done;
> >  
> > 2) kexec_file_load is suggested to use, and take precedence over
> > kexec_load in the future, if both are supported in one ARCH.
> 
> The deep problem is that kexec_file_load is distinctly less expressive
> than kexec_load.
> 
> > 3) Kexec_load is kept being used by ARCHes w/o kexc_file_load support,
> > and by ARCHes for back compatibility w/ kexec_file_load support.
> >
> > For 1) and 2), I think the reason is obvious as Eric said,
> > kexec_file_load is simple enough. And currently, whenever we got a bug
> > report, we may need fix them twice, for kexec_load and kexec_file_load.
> > If kexec_file_load is made by default, e.g on x86_64, we will change it
> > in kernel space only, for kexec_file_load. This is what I meant about
> > 'obsolete gradually'. I think for arm64, s390, they will do these too.
> > Unless there's some critical/blocker bug in kexec_load, to corrupt the
> > old kexec_load interface in old product.
> 
> Maybe.  The code that kexec_file_load sucked into the kernel is quite
> stable and rarely needs changes except during a port of kexec to
> another architecture.
> 
> Last I looked the real maintenance effor of kexec and kexec on panic was
> in the drivers.  So I don't think we can use maintenance to do anything.

Not sure if I got it. But if check Lianbo's patches, a lot of effort has
been taken to make SEV work well on kexec_file_load. And we have
switched to use kexec_file_load in the newly published  Fedora release
on x86_64 by default. Before this, Lianbo has investigated and done many
experiments to make sure the switching is safe. We finally made this
decision. Next we will do the switch in Enterprise distros. Once these
are proved safe, we will suggest customers to use kexec_file_load for
kexec rebooting too. In the future, we will only care about
kexec_file_load if everything is going well. But as I have explained
repeatedly, only caring about kexec_file_load means we will leave
kexec_load as is, we will not add new feature or improvement patches
for it.

commit 6a20bd54473e11011bf2b47efb52d0759d412854
Author: Lianbo Jiang <lijiang@redhat.com>
Date:   Thu Jan 16 13:47:35 2020 +0800

    kdump-lib: switch to the kexec_file_load() syscall on x86_64 by default

> 
> > For 3), people can still use kexec_load and develop/fix for it, if no
> > kexec_file_load supported. But 32-bit arm should be a different one,
> > more like i386, we will leave it as is, and fix anything which could
> > break it. But people really expects to improve or add feature to it? E.g
> > in this patchset, the mem hotplug issue James raised, I assume James is
> > focusing on arm64, x86_64, but not 32-bit arm. As DavidH commented in
> > another reply, people even don't agree to continue supporting memory
> > hotplug on 32-bit system. We ever took effort to fix a memory hotplug
> > bug on i386 with a patch, but people would rather set it as BROKEN.
> 
> For memory hotplug just reload.  Userspace already gets good events.

Kexec_file_load is easy to maintain. This is an example.

Locking the hotplug area where the kexec-ed kernel is targeted, as in this patchset,
is obviously not right. We can't disable memory hotplug just because
a kexec-ed kernel is loaded ahead of time.

Reloading is also not a good fix. Kexec-ed kernel is targeted at a
movable area, reloading can avoid kexec rebooting corruption if that
area is hot removed. But if that area is not removed, locating kernel
into the hotpluggable area will change the area into unmovable zone.
Unless we decide to not support memory hotplug in kexec-ed kernel, I
guess it's very hard. Now in our distros kexec rebooting has been
supported, the big cloud providers are deploying linux in guest, bugs on
kexec reboot failure has been reported. They need the memory hotplug to
increase/decrease memory.

The root cause is that the kexec-ed kernel is targeted at a hotpluggable
memory region. Just avoiding the movable area can fix it. In kexec_file_load(),
just checking for or picking unmovable regions to put the kernel/initrd in,
in function locate_mem_hole_callback(), can fix it. Whether the page's or
pageblock's zone is movable or not is easy to know. This fix doesn't need to
bother other components.

> 
> We should not expect anything except a panic kernel to be loaded over a
> memory hotplug event. The kexec on panic code should actually be loaded
> in a location that we don't reliquish if asked for it.
> 
> Quite frankly at this point I would love to see the signature fad die,
> which would allow us to remove kexec_file_load.  I still have not seen
> the signature code used anywhere except by people anticipating trouble.
> 
> Given that Microsoft has already directly signed a malicous bootloader.
> (Not in the Linux ecosystem).  I don't even know if any of the reasons
> for having kexec_file_load are legtimate.
> 
> 
> If someone wants to do the work and ensure everything that is possible
> to load with kexec_load is possible to load with kexec_file_load.
> Kernels supporting the multi-boot protocol etc.  Then we can consider
> deprecating kexec_load.
> 
> 
> I think it took me about 15 years to remove the sysctl system call and
> it only ever had about 10 users.  If you want to go through that kind of
> work to make certain there are no more users and that everything they
> could do with the old interface is doable with the new interface then
> please be my guest.  Until then we need to fully support kexec_load.

I want to clarify again, we have no plan to deprecate kexec_load.
We just plan to use kexec_file_load more in our distros, for both legacy
systems and systems with secure boot.

Eric, I am glad to see you shared your opinion about kexec_file_load.
Without the discussion in this thread, we might not have known it. So I have
one question: it seems kexec_file_load will continue to exist, and the ARCHes
our distros support (x86_64, s390, ppc, arm64) all have kexec_file_load.
Do you object to us continuing to use kexec_file_load for signature
verification and normal kexec/kdump booting? Or do you plan to deprecate
kexec_file_load?
Baoquan He April 14, 2020, 6:51 a.m. UTC | #27
On 04/14/20 at 02:40pm, Baoquan He wrote:
> On 04/13/20 at 08:15am, Eric W. Biederman wrote:
> > Baoquan He <bhe@redhat.com> writes:
> > 
> > > On 04/12/20 at 02:52pm, Eric W. Biederman wrote:
> > >> 
> > >> The only benefit of kexec_file_load is that it is simple enough from a
> > >> kernel perspective that signatures can be checked.
> > >
> > > We don't have this restriction any more with below commit:
> > >
> > > commit 99d5cadfde2b ("kexec_file: split KEXEC_VERIFY_SIG into KEXEC_SIG
> > > and KEXEC_SIG_FORCE")
> > >
> > > With KEXEC_SIG_FORCE not set, we can use kexec_load_file to cover both
> > > secure boot or legacy system for kexec/kdump. Being simple enough is
> > > enough to astract and convince us to use it instead. And kexec_file_load
> > > has been in use for several years on systems with secure boot, since
> > > added in 2014, on x86_64.
> > 
> > No.  Actaully kexec_file_load is the less capable interface, and less
> > flexible interface.  Which is why it is appropriate for signature
> > verification.
> 
> Well, everyone has a stance and the corresponding view. You could have
> wider view from long time maintenance and in upstrem position, and think
> kexec_file_load is horrible. But I can only see from our work as a front
> line engineer to maintain/develop kexec/kdump in RHEL, and think
> kexec_file_load is easier to maintain.
> 
> Surely except of multiple kernel image format support. No matter it is
> kexec_load and kexec_file_load, e.g in x86_64, we only support bzImage.
> This is produced from kerel building by default. We have no way to
> support it in our distros and add it into kexec_file_load.
> 
> [RFC PATCH] x86/boot: make ELF kernel multiboot-able
> https://lkml.org/lkml/2017/2/15/654
> 
> > 
> > >> kexec_load in every other respect is the more capable and functional
> > >> interface.  It makes no sense to get rid of it.
> > >> 
> > >> It does make sense to reload with a loaded kernel on memory hotplug.
> > >> That is simple and easy.  If we are going to handle something in the
> > >> kernel it should simple an automated unloading of the kernel on memory
> > >> hotplug.
> > >> 
> > >> 
> > >> I think it would be irresponsible to deprecate kexec_load on any
> > >> platform.
> > >> 
> > >> I also suspect that kexec_file_load could be taught to copy the dtb
> > >> on arm32 if someone wants to deal with signatures.
> > >> 
> > >> We definitely can not even think of deprecating kexec_load until
> > >> architecture that supports it also supports kexec_file_load and everyone
> > >> is happy with that interface.  That is Linus's no regression rule.
> > >
> > > I should pick a milder word to express our tendency and tell our plan
> > > then 'obsolete'. Even though I added 'gradually', seems it doesn't help
> > > much. I didn't mean to say 'deprecate' at all when replied.
> > >
> > > The situation and trend I understand about kexec_load and kexec_file_load
> > > are:
> > >
> > > 1) Supporting kexec_file_load is suggested to add in ARCHes which don't
> > > have yet, just as x86_64, arm64 and s390 have done;
> > >  
> > > 2) kexec_file_load is suggested to use, and take precedence over
> > > kexec_load in the future, if both are supported in one ARCH.
> > 
> > The deep problem is that kexec_file_load is distinctly less expressive
> > than kexec_load.
> > 
> > > 3) Kexec_load is kept being used by ARCHes w/o kexc_file_load support,
> > > and by ARCHes for back compatibility w/ kexec_file_load support.
> > >
> > > For 1) and 2), I think the reason is obvious as Eric said,
> > > kexec_file_load is simple enough. And currently, whenever we got a bug
> > > report, we may need fix them twice, for kexec_load and kexec_file_load.
> > > If kexec_file_load is made by default, e.g on x86_64, we will change it
> > > in kernel space only, for kexec_file_load. This is what I meant about
> > > 'obsolete gradually'. I think for arm64, s390, they will do these too.
> > > Unless there's some critical/blocker bug in kexec_load, to corrupt the
> > > old kexec_load interface in old product.
> > 
> > Maybe.  The code that kexec_file_load sucked into the kernel is quite
> > stable and rarely needs changes except during a port of kexec to
> > another architecture.
> > 
> > Last I looked the real maintenance effor of kexec and kexec on panic was
> > in the drivers.  So I don't think we can use maintenance to do anything.
> 
> Not sure if I got it. But if check Lianbo's patches, a lot of effort has
> been taken to make SEV work well on kexec_file_load. And we have
> switched to use kexec_file_load in the newly published  Fedora release
> on x86_64 by default. Before this, Lianbo has investigated and done many
> experiments to make sure the switching is safe. We finally made this
> decision. Next we will do the switch in Enterprise distros. Once these
> are proved safe, we will suggest customers to use kexec_file_load for
> kexec rebooting too. In the future, we will only care about
> kexec_file_load if everying is going well. But as I have explained
> repeatedly, only caring about kexec_file_load means we will leave
> kexec_load as is, we will not add new feature or improvement patches
> for it.
> 
> commit 6a20bd54473e11011bf2b47efb52d0759d412854
> Author: Lianbo Jiang <lijiang@redhat.com>
> Date:   Thu Jan 16 13:47:35 2020 +0800
> 
>     kdump-lib: switch to the kexec_file_load() syscall on x86_64 by default
> 
> > 
> > > For 3), people can still use kexec_load and develop/fix for it, if no
> > > kexec_file_load supported. But 32-bit arm should be a different one,
> > > more like i386, we will leave it as is, and fix anything which could
> > > break it. But people really expects to improve or add feature to it? E.g
> > > in this patchset, the mem hotplug issue James raised, I assume James is
> > > focusing on arm64, x86_64, but not 32-bit arm. As DavidH commented in
> > > another reply, people even don't agree to continue supporting memory
> > > hotplug on 32-bit system. We ever took effort to fix a memory hotplug
> > > bug on i386 with a patch, but people would rather set it as BROKEN.
> > 
> > For memory hotplug just reload.  Userspace already gets good events.
> 
> Kexec_file_load is easy to maintain. This is an example.
> 
> Lock the hotplug area where kexed-ed kernel is targeted in this patchset,
> it's obviously not right. We can't disable memory hotplug just because
> kexec-ed kernel is loaded ahead of time. 
> 
> Reloading is also not a good fix. Kexec-ed kernel is targeted at a
> movable area, reloading can avoid kexec rebooting corruption if that
> area is hot removed. But if that area is not removed, locating kernel
> into the hotpluggable area will change the area into ummovable zone.

Here I mean if kexec kernel is targeted at a hotpluggable memory region,
after kexec rebooting, that region will become unmovable. People can't
hot remove it in kexec-ed kernel.

> Unless we decide to not support memory hotplug in kexec-ed kernel, I
> guess it's very hard. Now in our distros kexec rebooting has been
> supported, the big cloud providers are deploying linux in guest, bugs on
> kexec reboot failure has been reported. They need the memory hotplug to
> increase/decrease memory.
> 
> The root cause is kexec-ed kernel is targeted at hotpluggable memory
> region. Just avoiding the movable area can fix it. In kexec_file_load(),
> just checking or picking those unmovable region to put kernel/initrd in
> function locate_mem_hole_callback() can fix it. The page or pageblock's
> zone is movable or not, it's easy to know. This fix doesn't need to
> bother other component.
> 
> > 
> > We should not expect anything except a panic kernel to be loaded over a
> > memory hotplug event. The kexec on panic code should actually be loaded
> > in a location that we don't reliquish if asked for it.
> > 
> > Quite frankly at this point I would love to see the signature fad die,
> > which would allow us to remove kexec_file_load.  I still have not seen
> > the signature code used anywhere except by people anticipating trouble.
> > 
> > Given that Microsoft has already directly signed a malicous bootloader.
> > (Not in the Linux ecosystem).  I don't even know if any of the reasons
> > for having kexec_file_load are legtimate.
> > 
> > 
> > If someone wants to do the work and ensure everything that is possible
> > to load with kexec_load is possible to load with kexec_file_load.
> > Kernels supporting the multi-boot protocol etc.  Then we can consider
> > deprecating kexec_load.
> > 
> > 
> > I think it took me about 15 years to remove the sysctl system call and
> > it only ever had about 10 users.  If you want to go through that kind of
> > work to make certain there are no more users and that everything they
> > could do with the old interface is doable with the new interface then
> > please be my guest.  Until then we need to fully support kexec_load.
> 
> I want to clarify again, we have no plan to deprecate kexec_load.
> We just plan to use kexec_file_load more in our distros, for both legacy
> system or system with secure boot.
> 
> Eric, I am glad to see you told your opinion about kexec_file_load.
> Without the discussion in this thread, we may not know it. So I have one
> question, seems kexec_file_load will continue existing, the ARCHes our
> distros is supporting, x86_64, s390, ppc, arm64, all have kexec_file_load,
> do you object us to continue using kexec_file_load, for signature
> verification and normal kexec/kdump booting? Or you plan to deprecate
> kexec_file_load?
David Hildenbrand April 14, 2020, 7:05 a.m. UTC | #28
On 10.04.20 21:10, Andrew Morton wrote:
> It's unclear (to me) what is the status of this patchset.  But it does appear that
> an new version can be expected?
> 

I'd suggest to unqueue the patches until we have a consensus.

While there are a couple of ideas floating around here, my current
suggestion would be either

1. Indicate all hotplugged memory as "System RAM (hotplugged)" in
/proc/iomem and the firmware memmap (on all architectures). This will
require kexec changes, but I would have assumed that kexec has to be
updated in lock-step with the kernel just like e.g., makedumpfile.
Modify kexec() to not place the kexec kernel on these areas (easy) but
still consider them as crash regions to dump. When loading a kexec
kernel, validate in the kernel that the memory is appropriate.

2. Make kexec() reload the kernel whenever we e.g., get a udev event
for removal of memory in /sys/devices/system/memory/. On every
remove_memory(), invalidate the loaded kernel in the kernel.


As I mentioned somewhere, 1. will be interesting for virtio-mem, where
we don't want any kexec kernel to be placed on virtio-mem-added memory.
David Hildenbrand April 14, 2020, 8 a.m. UTC | #29
On 14.04.20 08:40, Baoquan He wrote:
> On 04/13/20 at 08:15am, Eric W. Biederman wrote:
>> Baoquan He <bhe@redhat.com> writes:
>>
>>> On 04/12/20 at 02:52pm, Eric W. Biederman wrote:
>>>>
>>>> The only benefit of kexec_file_load is that it is simple enough from a
>>>> kernel perspective that signatures can be checked.
>>>
>>> We don't have this restriction any more with below commit:
>>>
>>> commit 99d5cadfde2b ("kexec_file: split KEXEC_VERIFY_SIG into KEXEC_SIG
>>> and KEXEC_SIG_FORCE")
>>>
>>> With KEXEC_SIG_FORCE not set, we can use kexec_load_file to cover both
>>> secure boot or legacy system for kexec/kdump. Being simple enough is
>>> enough to astract and convince us to use it instead. And kexec_file_load
>>> has been in use for several years on systems with secure boot, since
>>> added in 2014, on x86_64.
>>
>> No.  Actaully kexec_file_load is the less capable interface, and less
>> flexible interface.  Which is why it is appropriate for signature
>> verification.
> 
> Well, everyone has a stance and the corresponding view. You could have
> wider view from long time maintenance and in upstrem position, and think
> kexec_file_load is horrible. But I can only see from our work as a front
> line engineer to maintain/develop kexec/kdump in RHEL, and think
> kexec_file_load is easier to maintain.
> 
> Surely except of multiple kernel image format support. No matter it is
> kexec_load and kexec_file_load, e.g in x86_64, we only support bzImage.
> This is produced from kerel building by default. We have no way to
> support it in our distros and add it into kexec_file_load.
> 
> [RFC PATCH] x86/boot: make ELF kernel multiboot-able
> https://lkml.org/lkml/2017/2/15/654
> 
>>
>>>> kexec_load in every other respect is the more capable and functional
>>>> interface.  It makes no sense to get rid of it.
>>>>
>>>> It does make sense to reload with a loaded kernel on memory hotplug.
>>>> That is simple and easy.  If we are going to handle something in the
>>>> kernel it should simple an automated unloading of the kernel on memory
>>>> hotplug.
>>>>
>>>>
>>>> I think it would be irresponsible to deprecate kexec_load on any
>>>> platform.
>>>>
>>>> I also suspect that kexec_file_load could be taught to copy the dtb
>>>> on arm32 if someone wants to deal with signatures.
>>>>
>>>> We definitely can not even think of deprecating kexec_load until
>>>> architecture that supports it also supports kexec_file_load and everyone
>>>> is happy with that interface.  That is Linus's no regression rule.
>>>
>>> I should pick a milder word to express our tendency and tell our plan
>>> then 'obsolete'. Even though I added 'gradually', seems it doesn't help
>>> much. I didn't mean to say 'deprecate' at all when replied.
>>>
>>> The situation and trend I understand about kexec_load and kexec_file_load
>>> are:
>>>
>>> 1) Supporting kexec_file_load is suggested to add in ARCHes which don't
>>> have yet, just as x86_64, arm64 and s390 have done;
>>>  
>>> 2) kexec_file_load is suggested to use, and take precedence over
>>> kexec_load in the future, if both are supported in one ARCH.
>>
>> The deep problem is that kexec_file_load is distinctly less expressive
>> than kexec_load.
>>
>>> 3) Kexec_load is kept being used by ARCHes w/o kexc_file_load support,
>>> and by ARCHes for back compatibility w/ kexec_file_load support.
>>>
>>> For 1) and 2), I think the reason is obvious as Eric said,
>>> kexec_file_load is simple enough. And currently, whenever we got a bug
>>> report, we may need fix them twice, for kexec_load and kexec_file_load.
>>> If kexec_file_load is made by default, e.g on x86_64, we will change it
>>> in kernel space only, for kexec_file_load. This is what I meant about
>>> 'obsolete gradually'. I think for arm64, s390, they will do these too.
>>> Unless there's some critical/blocker bug in kexec_load, to corrupt the
>>> old kexec_load interface in old product.
>>
>> Maybe.  The code that kexec_file_load sucked into the kernel is quite
>> stable and rarely needs changes except during a port of kexec to
>> another architecture.
>>
>> Last I looked the real maintenance effor of kexec and kexec on panic was
>> in the drivers.  So I don't think we can use maintenance to do anything.
> 
> Not sure if I got it. But if check Lianbo's patches, a lot of effort has
> been taken to make SEV work well on kexec_file_load. And we have
> switched to use kexec_file_load in the newly published  Fedora release
> on x86_64 by default. Before this, Lianbo has investigated and done many
> experiments to make sure the switching is safe. We finally made this
> decision. Next we will do the switch in Enterprise distros. Once these
> are proved safe, we will suggest customers to use kexec_file_load for
> kexec rebooting too. In the future, we will only care about
> kexec_file_load if everying is going well. But as I have explained
> repeatedly, only caring about kexec_file_load means we will leave
> kexec_load as is, we will not add new feature or improvement patches
> for it.
> 
> commit 6a20bd54473e11011bf2b47efb52d0759d412854
> Author: Lianbo Jiang <lijiang@redhat.com>
> Date:   Thu Jan 16 13:47:35 2020 +0800
> 
>     kdump-lib: switch to the kexec_file_load() syscall on x86_64 by default
> 
>>
>>> For 3), people can still use kexec_load and develop/fix for it, if no
>>> kexec_file_load supported. But 32-bit arm should be a different one,
>>> more like i386, we will leave it as is, and fix anything which could
>>> break it. But people really expects to improve or add feature to it? E.g
>>> in this patchset, the mem hotplug issue James raised, I assume James is
>>> focusing on arm64, x86_64, but not 32-bit arm. As DavidH commented in
>>> another reply, people even don't agree to continue supporting memory
>>> hotplug on 32-bit system. We ever took effort to fix a memory hotplug
>>> bug on i386 with a patch, but people would rather set it as BROKEN.
>>
>> For memory hotplug just reload.  Userspace already gets good events.
> 
> Kexec_file_load is easy to maintain. This is an example.
> 
> Lock the hotplug area where kexed-ed kernel is targeted in this patchset,
> it's obviously not right. We can't disable memory hotplug just because
> kexec-ed kernel is loaded ahead of time. 
> 
> Reloading is also not a good fix. Kexec-ed kernel is targeted at a
> movable area, reloading can avoid kexec rebooting corruption if that
> area is hot removed. But if that area is not removed, locating kernel
> into the hotpluggable area will change the area into ummovable zone.
> Unless we decide to not support memory hotplug in kexec-ed kernel, I
> guess it's very hard. Now in our distros kexec rebooting has been
> supported, the big cloud providers are deploying linux in guest, bugs on
> kexec reboot failure has been reported. They need the memory hotplug to
> increase/decrease memory.
> 
> The root cause is kexec-ed kernel is targeted at hotpluggable memory
> region. Just avoiding the movable area can fix it. In kexec_file_load(),
> just checking or picking those unmovable region to put kernel/initrd in
> function locate_mem_hole_callback() can fix it. The page or pageblock's
> zone is movable or not, it's easy to know. This fix doesn't need to
> bother other component.

I don't fully agree. E.g., just because memory is onlined to ZONE_NORMAL
does not imply that it cannot get offlined and removed e.g., this is
heavily used on ppc64, with 16MB sections.
Dave Young April 14, 2020, 9:16 a.m. UTC | #30
On 04/13/20 at 08:15am, Eric W. Biederman wrote:
> Baoquan He <bhe@redhat.com> writes:
> 
> > On 04/12/20 at 02:52pm, Eric W. Biederman wrote:
> >> 
> >> The only benefit of kexec_file_load is that it is simple enough from a
> >> kernel perspective that signatures can be checked.
> >
> > We don't have this restriction any more with below commit:
> >
> > commit 99d5cadfde2b ("kexec_file: split KEXEC_VERIFY_SIG into KEXEC_SIG
> > and KEXEC_SIG_FORCE")
> >
> > With KEXEC_SIG_FORCE not set, we can use kexec_load_file to cover both
> > secure boot or legacy system for kexec/kdump. Being simple enough is
> > enough to astract and convince us to use it instead. And kexec_file_load
> > has been in use for several years on systems with secure boot, since
> > added in 2014, on x86_64.
> 
> No.  Actaully kexec_file_load is the less capable interface, and less
> flexible interface.  Which is why it is appropriate for signature
> verification.

I agree that the user space design is more flexible, but for the
common use case of loading a bzImage (say x86 as an example),
kexec_file_load is good enough. We could have other potential improvements
based on kexec_file_load. For example, we could use it to do some early kdump
loading, e.g. try to load an attached kdump kernel immediately once
the crashkernel memory gets reserved.

> 
> >> kexec_load in every other respect is the more capable and functional
> >> interface.  It makes no sense to get rid of it.

We are not removing kexec_load at all; it is indeed helpful in many cases,
as all agreed.  But if we have a bug reported for both, we may fix
kexec_file_load first because it is usually easier, and we do not need to
worry too much about old kernel and new kernel compatibility.

For example, with the recent breakage we found in the EFI path, kexec_file_load
just works after the EFI cleanup, but kexec_load depends on the ABI we
added, so we must fix it as below:
https://lore.kernel.org/linux-efi/20200410135644.GB6772@dhcp-128-65.nay.redhat.com/ 

> >> 
> >> It does make sense to reload with a loaded kernel on memory hotplug.
> >> That is simple and easy.  If we are going to handle something in the
> >> kernel it should simple an automated unloading of the kernel on memory
> >> hotplug.
> >> 
> >> 
> >> I think it would be irresponsible to deprecate kexec_load on any
> >> platform.
> >> 
> >> I also suspect that kexec_file_load could be taught to copy the dtb
> >> on arm32 if someone wants to deal with signatures.
> >> 
> >> We definitely can not even think of deprecating kexec_load until
> >> architecture that supports it also supports kexec_file_load and everyone
> >> is happy with that interface.  That is Linus's no regression rule.
> >
> > I should pick a milder word to express our tendency and tell our plan
> > then 'obsolete'. Even though I added 'gradually', seems it doesn't help
> > much. I didn't mean to say 'deprecate' at all when replied.
> >
> > The situation and trend I understand about kexec_load and kexec_file_load
> > are:
> >
> > 1) Supporting kexec_file_load is suggested to add in ARCHes which don't
> > have yet, just as x86_64, arm64 and s390 have done;
> >  
> > 2) kexec_file_load is suggested to use, and take precedence over
> > kexec_load in the future, if both are supported in one ARCH.
> 
> The deep problem is that kexec_file_load is distinctly less expressive
> than kexec_load.
> 
> > 3) Kexec_load is kept being used by ARCHes w/o kexc_file_load support,
> > and by ARCHes for back compatibility w/ kexec_file_load support.
> >
> > For 1) and 2), I think the reason is obvious as Eric said,
> > kexec_file_load is simple enough. And currently, whenever we got a bug
> > report, we may need fix them twice, for kexec_load and kexec_file_load.
> > If kexec_file_load is made by default, e.g on x86_64, we will change it
> > in kernel space only, for kexec_file_load. This is what I meant about
> > 'obsolete gradually'. I think for arm64, s390, they will do these too.
> > Unless there's some critical/blocker bug in kexec_load, to corrupt the
> > old kexec_load interface in old product.
> 
> Maybe.  The code that kexec_file_load sucked into the kernel is quite
> stable and rarely needs changes except during a port of kexec to
> another architecture.
> 
> Last I looked the real maintenance effor of kexec and kexec on panic was
> in the drivers.  So I don't think we can use maintenance to do anything.
> 
> > For 3), people can still use kexec_load and develop/fix for it, if no
> > kexec_file_load supported. But 32-bit arm should be a different one,
> > more like i386, we will leave it as is, and fix anything which could
> > break it. But people really expects to improve or add feature to it? E.g
> > in this patchset, the mem hotplug issue James raised, I assume James is
> > focusing on arm64, x86_64, but not 32-bit arm. As DavidH commented in
> > another reply, people even don't agree to continue supporting memory
> > hotplug on 32-bit system. We ever took effort to fix a memory hotplug
> > bug on i386 with a patch, but people would rather set it as BROKEN.
> 
> For memory hotplug just reload.  Userspace already gets good events.
> 
> We should not expect anything except a panic kernel to be loaded over a
> memory hotplug event. The kexec on panic code should actually be loaded
> in a location that we don't reliquish if asked for it.
> 
> Quite frankly at this point I would love to see the signature fad die,
> which would allow us to remove kexec_file_load.  I still have not seen
> the signature code used anywhere except by people anticipating trouble.

Same for me: I also hate Secure Boot, and I also do not like the
trouble added by signature verification.  But we still found that,
beyond the Secure Boot use cases, it is also useful in other usual cases.

And since the kernel has lockdown supported, we have to live with it.

Thanks
Dave
Baoquan He April 14, 2020, 9:22 a.m. UTC | #31
On 04/14/20 at 10:00am, David Hildenbrand wrote:
> On 14.04.20 08:40, Baoquan He wrote:
> > On 04/13/20 at 08:15am, Eric W. Biederman wrote:
> >> Baoquan He <bhe@redhat.com> writes:
> >>
> >>> On 04/12/20 at 02:52pm, Eric W. Biederman wrote:
> >>>>
> >>>> The only benefit of kexec_file_load is that it is simple enough from a
> >>>> kernel perspective that signatures can be checked.
> >>>
> >>> We don't have this restriction any more with below commit:
> >>>
> >>> commit 99d5cadfde2b ("kexec_file: split KEXEC_VERIFY_SIG into KEXEC_SIG
> >>> and KEXEC_SIG_FORCE")
> >>>
> >>> With KEXEC_SIG_FORCE not set, we can use kexec_load_file to cover both
> >>> secure boot or legacy system for kexec/kdump. Being simple enough is
> >>> enough to astract and convince us to use it instead. And kexec_file_load
> >>> has been in use for several years on systems with secure boot, since
> >>> added in 2014, on x86_64.
> >>
> >> No.  Actaully kexec_file_load is the less capable interface, and less
> >> flexible interface.  Which is why it is appropriate for signature
> >> verification.
> > 
> > Well, everyone has a stance and the corresponding view. You could have
> > wider view from long time maintenance and in upstrem position, and think
> > kexec_file_load is horrible. But I can only see from our work as a front
> > line engineer to maintain/develop kexec/kdump in RHEL, and think
> > kexec_file_load is easier to maintain.
> > 
> > Surely except of multiple kernel image format support. No matter it is
> > kexec_load and kexec_file_load, e.g in x86_64, we only support bzImage.
> > This is produced from kerel building by default. We have no way to
> > support it in our distros and add it into kexec_file_load.
> > 
> > [RFC PATCH] x86/boot: make ELF kernel multiboot-able
> > https://lkml.org/lkml/2017/2/15/654
> > 
> >>
> >>>> kexec_load in every other respect is the more capable and functional
> >>>> interface.  It makes no sense to get rid of it.
> >>>>
> >>>> It does make sense to reload with a loaded kernel on memory hotplug.
> >>>> That is simple and easy.  If we are going to handle something in the
> >>>> kernel it should simple an automated unloading of the kernel on memory
> >>>> hotplug.
> >>>>
> >>>>
> >>>> I think it would be irresponsible to deprecate kexec_load on any
> >>>> platform.
> >>>>
> >>>> I also suspect that kexec_file_load could be taught to copy the dtb
> >>>> on arm32 if someone wants to deal with signatures.
> >>>>
> >>>> We definitely can not even think of deprecating kexec_load until
> >>>> architecture that supports it also supports kexec_file_load and everyone
> >>>> is happy with that interface.  That is Linus's no regression rule.
> >>>
> >>> I should pick a milder word to express our tendency and tell our plan
> >>> then 'obsolete'. Even though I added 'gradually', seems it doesn't help
> >>> much. I didn't mean to say 'deprecate' at all when replied.
> >>>
> >>> The situation and trend I understand about kexec_load and kexec_file_load
> >>> are:
> >>>
> >>> 1) Supporting kexec_file_load is suggested to add in ARCHes which don't
> >>> have yet, just as x86_64, arm64 and s390 have done;
> >>>  
> >>> 2) kexec_file_load is suggested to use, and take precedence over
> >>> kexec_load in the future, if both are supported in one ARCH.
> >>
> >> The deep problem is that kexec_file_load is distinctly less expressive
> >> than kexec_load.
> >>
> >>> 3) Kexec_load is kept being used by ARCHes w/o kexc_file_load support,
> >>> and by ARCHes for back compatibility w/ kexec_file_load support.
> >>>
> >>> For 1) and 2), I think the reason is obvious as Eric said,
> >>> kexec_file_load is simple enough. And currently, whenever we got a bug
> >>> report, we may need fix them twice, for kexec_load and kexec_file_load.
> >>> If kexec_file_load is made by default, e.g on x86_64, we will change it
> >>> in kernel space only, for kexec_file_load. This is what I meant about
> >>> 'obsolete gradually'. I think for arm64, s390, they will do these too.
> >>> Unless there's some critical/blocker bug in kexec_load, to corrupt the
> >>> old kexec_load interface in old product.
> >>
> >> Maybe.  The code that kexec_file_load sucked into the kernel is quite
> >> stable and rarely needs changes except during a port of kexec to
> >> another architecture.
> >>
> >> Last I looked the real maintenance effor of kexec and kexec on panic was
> >> in the drivers.  So I don't think we can use maintenance to do anything.
> > 
> > Not sure if I got it. But if check Lianbo's patches, a lot of effort has
> > been taken to make SEV work well on kexec_file_load. And we have
> > switched to use kexec_file_load in the newly published  Fedora release
> > on x86_64 by default. Before this, Lianbo has investigated and done many
> > experiments to make sure the switching is safe. We finally made this
> > decision. Next we will do the switch in Enterprise distros. Once these
> > are proved safe, we will suggest customers to use kexec_file_load for
> > kexec rebooting too. In the future, we will only care about
> > kexec_file_load if everying is going well. But as I have explained
> > repeatedly, only caring about kexec_file_load means we will leave
> > kexec_load as is, we will not add new feature or improvement patches
> > for it.
> > 
> > commit 6a20bd54473e11011bf2b47efb52d0759d412854
> > Author: Lianbo Jiang <lijiang@redhat.com>
> > Date:   Thu Jan 16 13:47:35 2020 +0800
> > 
> >     kdump-lib: switch to the kexec_file_load() syscall on x86_64 by default
> > 
> >>
> >>> For 3), people can still use kexec_load and develop/fix for it, if no
> >>> kexec_file_load supported. But 32-bit arm should be a different one,
> >>> more like i386, we will leave it as is, and fix anything which could
> >>> break it. But people really expects to improve or add feature to it? E.g
> >>> in this patchset, the mem hotplug issue James raised, I assume James is
> >>> focusing on arm64, x86_64, but not 32-bit arm. As DavidH commented in
> >>> another reply, people even don't agree to continue supporting memory
> >>> hotplug on 32-bit system. We ever took effort to fix a memory hotplug
> >>> bug on i386 with a patch, but people would rather set it as BROKEN.
> >>
> >> For memory hotplug just reload.  Userspace already gets good events.
> > 
> > Kexec_file_load is easy to maintain. This is an example.
> > 
> > Lock the hotplug area where kexed-ed kernel is targeted in this patchset,
> > it's obviously not right. We can't disable memory hotplug just because
> > kexec-ed kernel is loaded ahead of time. 
> > 
> > Reloading is also not a good fix. Kexec-ed kernel is targeted at a
> > movable area, reloading can avoid kexec rebooting corruption if that
> > area is hot removed. But if that area is not removed, locating kernel
> > into the hotpluggable area will change the area into ummovable zone.
> > Unless we decide to not support memory hotplug in kexec-ed kernel, I
> > guess it's very hard. Now in our distros kexec rebooting has been
> > supported, the big cloud providers are deploying linux in guest, bugs on
> > kexec reboot failure has been reported. They need the memory hotplug to
> > increase/decrease memory.
> > 
> > The root cause is kexec-ed kernel is targeted at hotpluggable memory
> > region. Just avoiding the movable area can fix it. In kexec_file_load(),
> > just checking or picking those unmovable region to put kernel/initrd in
> > function locate_mem_hole_callback() can fix it. The page or pageblock's
> > zone is movable or not, it's easy to know. This fix doesn't need to
> > bother other component.
> 
> I don't fully agree. E.g., just because memory is onlined to ZONE_NORMAL
> does not imply that it cannot get offlined and removed e.g., this is
> heavily used on ppc64, with 16MB sections.

Really? I only know there are two kinds of mem hotplug on ppc, but don't
know the details. So in this case, is there any flag or a way to know
which memory blocks are hotpluggable? I am curious how kernel data is
kept from being placed in this area. Or does ppc just freely use it for
kernel data or user space data, and then try to migrate on hot remove?
David Hildenbrand April 14, 2020, 9:37 a.m. UTC | #32
On 14.04.20 11:22, Baoquan He wrote:
> On 04/14/20 at 10:00am, David Hildenbrand wrote:
>> On 14.04.20 08:40, Baoquan He wrote:
>>> On 04/13/20 at 08:15am, Eric W. Biederman wrote:
>>>> Baoquan He <bhe@redhat.com> writes:
>>>>
>>>>> On 04/12/20 at 02:52pm, Eric W. Biederman wrote:
>>>>>>
>>>>>> The only benefit of kexec_file_load is that it is simple enough from a
>>>>>> kernel perspective that signatures can be checked.
>>>>>
>>>>> We don't have this restriction any more with below commit:
>>>>>
>>>>> commit 99d5cadfde2b ("kexec_file: split KEXEC_VERIFY_SIG into KEXEC_SIG
>>>>> and KEXEC_SIG_FORCE")
>>>>>
>>>>> With KEXEC_SIG_FORCE not set, we can use kexec_load_file to cover both
>>>>> secure boot or legacy system for kexec/kdump. Being simple enough is
>>>>> enough to astract and convince us to use it instead. And kexec_file_load
>>>>> has been in use for several years on systems with secure boot, since
>>>>> added in 2014, on x86_64.
>>>>
>>>> No.  Actaully kexec_file_load is the less capable interface, and less
>>>> flexible interface.  Which is why it is appropriate for signature
>>>> verification.
>>>
>>> Well, everyone has a stance and the corresponding view. You could have
>>> wider view from long time maintenance and in upstrem position, and think
>>> kexec_file_load is horrible. But I can only see from our work as a front
>>> line engineer to maintain/develop kexec/kdump in RHEL, and think
>>> kexec_file_load is easier to maintain.
>>>
>>> Surely except of multiple kernel image format support. No matter it is
>>> kexec_load and kexec_file_load, e.g in x86_64, we only support bzImage.
>>> This is produced from kerel building by default. We have no way to
>>> support it in our distros and add it into kexec_file_load.
>>>
>>> [RFC PATCH] x86/boot: make ELF kernel multiboot-able
>>> https://lkml.org/lkml/2017/2/15/654
>>>
>>>>
>>>>>> kexec_load in every other respect is the more capable and functional
>>>>>> interface.  It makes no sense to get rid of it.
>>>>>>
>>>>>> It does make sense to reload with a loaded kernel on memory hotplug.
>>>>>> That is simple and easy.  If we are going to handle something in the
>>>>>> kernel it should simple an automated unloading of the kernel on memory
>>>>>> hotplug.
>>>>>>
>>>>>>
>>>>>> I think it would be irresponsible to deprecate kexec_load on any
>>>>>> platform.
>>>>>>
>>>>>> I also suspect that kexec_file_load could be taught to copy the dtb
>>>>>> on arm32 if someone wants to deal with signatures.
>>>>>>
>>>>>> We definitely can not even think of deprecating kexec_load until
>>>>>> architecture that supports it also supports kexec_file_load and everyone
>>>>>> is happy with that interface.  That is Linus's no regression rule.
>>>>>
>>>>> I should pick a milder word to express our tendency and tell our plan
>>>>> then 'obsolete'. Even though I added 'gradually', seems it doesn't help
>>>>> much. I didn't mean to say 'deprecate' at all when replied.
>>>>>
>>>>> The situation and trend I understand about kexec_load and kexec_file_load
>>>>> are:
>>>>>
>>>>> 1) Supporting kexec_file_load is suggested to add in ARCHes which don't
>>>>> have yet, just as x86_64, arm64 and s390 have done;
>>>>>  
>>>>> 2) kexec_file_load is suggested to use, and take precedence over
>>>>> kexec_load in the future, if both are supported in one ARCH.
>>>>
>>>> The deep problem is that kexec_file_load is distinctly less expressive
>>>> than kexec_load.
>>>>
>>>>> 3) Kexec_load is kept being used by ARCHes w/o kexc_file_load support,
>>>>> and by ARCHes for back compatibility w/ kexec_file_load support.
>>>>>
>>>>> For 1) and 2), I think the reason is obvious as Eric said,
>>>>> kexec_file_load is simple enough. And currently, whenever we got a bug
>>>>> report, we may need fix them twice, for kexec_load and kexec_file_load.
>>>>> If kexec_file_load is made by default, e.g on x86_64, we will change it
>>>>> in kernel space only, for kexec_file_load. This is what I meant about
>>>>> 'obsolete gradually'. I think for arm64, s390, they will do these too.
>>>>> Unless there's some critical/blocker bug in kexec_load, to corrupt the
>>>>> old kexec_load interface in old product.
>>>>
>>>> Maybe.  The code that kexec_file_load sucked into the kernel is quite
>>>> stable and rarely needs changes except during a port of kexec to
>>>> another architecture.
>>>>
>>>> Last I looked the real maintenance effor of kexec and kexec on panic was
>>>> in the drivers.  So I don't think we can use maintenance to do anything.
>>>
>>> Not sure if I got it. But if check Lianbo's patches, a lot of effort has
>>> been taken to make SEV work well on kexec_file_load. And we have
>>> switched to use kexec_file_load in the newly published  Fedora release
>>> on x86_64 by default. Before this, Lianbo has investigated and done many
>>> experiments to make sure the switching is safe. We finally made this
>>> decision. Next we will do the switch in Enterprise distros. Once these
>>> are proved safe, we will suggest customers to use kexec_file_load for
>>> kexec rebooting too. In the future, we will only care about
>>> kexec_file_load if everying is going well. But as I have explained
>>> repeatedly, only caring about kexec_file_load means we will leave
>>> kexec_load as is, we will not add new feature or improvement patches
>>> for it.
>>>
>>> commit 6a20bd54473e11011bf2b47efb52d0759d412854
>>> Author: Lianbo Jiang <lijiang@redhat.com>
>>> Date:   Thu Jan 16 13:47:35 2020 +0800
>>>
>>>     kdump-lib: switch to the kexec_file_load() syscall on x86_64 by default
>>>
>>>>
>>>>> For 3), people can still use kexec_load and develop/fix for it, if no
>>>>> kexec_file_load supported. But 32-bit arm should be a different one,
>>>>> more like i386, we will leave it as is, and fix anything which could
>>>>> break it. But people really expects to improve or add feature to it? E.g
>>>>> in this patchset, the mem hotplug issue James raised, I assume James is
>>>>> focusing on arm64, x86_64, but not 32-bit arm. As DavidH commented in
>>>>> another reply, people even don't agree to continue supporting memory
>>>>> hotplug on 32-bit system. We ever took effort to fix a memory hotplug
>>>>> bug on i386 with a patch, but people would rather set it as BROKEN.
>>>>
>>>> For memory hotplug just reload.  Userspace already gets good events.
>>>
>>> Kexec_file_load is easy to maintain. This is an example.
>>>
>>> Lock the hotplug area where kexed-ed kernel is targeted in this patchset,
>>> it's obviously not right. We can't disable memory hotplug just because
>>> kexec-ed kernel is loaded ahead of time. 
>>>
>>> Reloading is also not a good fix. Kexec-ed kernel is targeted at a
>>> movable area, reloading can avoid kexec rebooting corruption if that
>>> area is hot removed. But if that area is not removed, locating kernel
>>> into the hotpluggable area will change the area into ummovable zone.
>>> Unless we decide to not support memory hotplug in kexec-ed kernel, I
>>> guess it's very hard. Now in our distros kexec rebooting has been
>>> supported, the big cloud providers are deploying linux in guest, bugs on
>>> kexec reboot failure has been reported. They need the memory hotplug to
>>> increase/decrease memory.
>>>
>>> The root cause is kexec-ed kernel is targeted at hotpluggable memory
>>> region. Just avoiding the movable area can fix it. In kexec_file_load(),
>>> just checking or picking those unmovable region to put kernel/initrd in
>>> function locate_mem_hole_callback() can fix it. The page or pageblock's
>>> zone is movable or not, it's easy to know. This fix doesn't need to
>>> bother other component.
>>
>> I don't fully agree. E.g., just because memory is onlined to ZONE_NORMAL
>> does not imply that it cannot get offlined and removed e.g., this is
>> heavily used on ppc64, with 16MB sections.
> 
> Really? I just know there are two kinds of mem hoplug in ppc, but don't
> know the details. So in this case, is there any flag or a way to know
> those memory block are hotpluggable? I am curious how those kernel data
> is avoided to be put in this area. Or ppc just freely uses it for kernel
> data or user space data, then try to migrate when hot remove?

See
arch/powerpc/platforms/pseries/hotplug-memory.c:dlpar_memory_remove_by_count()

Under DLPAR, it can remove memory in LMB granularity, which is usually
16MB (== single section on ppc64). DLPAR will directly online all
hotplugged memory (LMBs) from the kernel using device_online(), which
will go to ZONE_NORMAL.

When trying to remove memory, it simply scans for offlineable 16MB
memory blocks (==section == LMB), offlines and removes them. No need for
the movable zone and all the involved issues.

Now, the interesting question is, can we have LMBs added during boot
(not via add_memory()) that will later be removed via remove_memory()?
IIRC, we had BUGs related to that, so I think yes. If a section contains
no unmovable allocations (after boot), it can get removed.
Dave Young April 14, 2020, 9:38 a.m. UTC | #33
> We do not remove kexec_load at all, it is indeed helpful in many cases
> as all agreed.  But if we have a bug reported for both we may fix
> kexec_file_load first because it is usually easier, also do not need to
> worry about too much about old kernel and new kernel compatibility.
> 
> For example the recent breakage we found in efi path, kexec_file_load
> just work after the efi cleanup, but kexec_load depends on the ABI we
> added, so we must fix it as below:
> https://lore.kernel.org/linux-efi/20200410135644.GB6772@dhcp-128-65.nay.redhat.com/ 

Also, we have some specific sysfs files exported for kexec-tools' use:
/sys/firmware/efi/runtime-map/* and a few other table addresses
(fw_vendor, runtime and config_table) under /sys/firmware/efi.

These are only used by the userspace kexec-tools for kexec_load. The
runtime field is now useless because of Ard's cleanup in the efi code,
but we have to keep it there because older kexec-tools will need it.

In this case kexec_file_load does not need those hacks at all.  So in the
future, if we have to invent some kernel/userspace ABI only for
kexec_load, we should be careful and maybe reject it if there is no
strong reason.

Thanks
Dave
Baoquan He April 14, 2020, 2:39 p.m. UTC | #34
On 04/14/20 at 11:37am, David Hildenbrand wrote:
> On 14.04.20 11:22, Baoquan He wrote:
> > On 04/14/20 at 10:00am, David Hildenbrand wrote:
> >> On 14.04.20 08:40, Baoquan He wrote:
> >>> On 04/13/20 at 08:15am, Eric W. Biederman wrote:
> >>>> Baoquan He <bhe@redhat.com> writes:
> >>>>
> >>>>> On 04/12/20 at 02:52pm, Eric W. Biederman wrote:
> >>>>>>
> >>>>>> The only benefit of kexec_file_load is that it is simple enough from a
> >>>>>> kernel perspective that signatures can be checked.
> >>>>>
> >>>>> We don't have this restriction any more with below commit:
> >>>>>
> >>>>> commit 99d5cadfde2b ("kexec_file: split KEXEC_VERIFY_SIG into KEXEC_SIG
> >>>>> and KEXEC_SIG_FORCE")
> >>>>>
> >>>>> With KEXEC_SIG_FORCE not set, we can use kexec_load_file to cover both
> >>>>> secure boot or legacy system for kexec/kdump. Being simple enough is
> >>>>> enough to astract and convince us to use it instead. And kexec_file_load
> >>>>> has been in use for several years on systems with secure boot, since
> >>>>> added in 2014, on x86_64.
> >>>>
> >>>> No.  Actaully kexec_file_load is the less capable interface, and less
> >>>> flexible interface.  Which is why it is appropriate for signature
> >>>> verification.
> >>>
> >>> Well, everyone has a stance and the corresponding view. You could have
> >>> wider view from long time maintenance and in upstrem position, and think
> >>> kexec_file_load is horrible. But I can only see from our work as a front
> >>> line engineer to maintain/develop kexec/kdump in RHEL, and think
> >>> kexec_file_load is easier to maintain.
> >>>
> >>> Surely except of multiple kernel image format support. No matter it is
> >>> kexec_load and kexec_file_load, e.g in x86_64, we only support bzImage.
> >>> This is produced from kerel building by default. We have no way to
> >>> support it in our distros and add it into kexec_file_load.
> >>>
> >>> [RFC PATCH] x86/boot: make ELF kernel multiboot-able
> >>> https://lkml.org/lkml/2017/2/15/654
> >>>
> >>>>
> >>>>>> kexec_load in every other respect is the more capable and functional
> >>>>>> interface.  It makes no sense to get rid of it.
> >>>>>>
> >>>>>> It does make sense to reload with a loaded kernel on memory hotplug.
> >>>>>> That is simple and easy.  If we are going to handle something in the
> >>>>>> kernel it should simple an automated unloading of the kernel on memory
> >>>>>> hotplug.
> >>>>>>
> >>>>>>
> >>>>>> I think it would be irresponsible to deprecate kexec_load on any
> >>>>>> platform.
> >>>>>>
> >>>>>> I also suspect that kexec_file_load could be taught to copy the dtb
> >>>>>> on arm32 if someone wants to deal with signatures.
> >>>>>>
> >>>>>> We definitely can not even think of deprecating kexec_load until
> >>>>>> architecture that supports it also supports kexec_file_load and everyone
> >>>>>> is happy with that interface.  That is Linus's no regression rule.
> >>>>>
> >>>>> I should pick a milder word to express our tendency and tell our plan
> >>>>> then 'obsolete'. Even though I added 'gradually', seems it doesn't help
> >>>>> much. I didn't mean to say 'deprecate' at all when replied.
> >>>>>
> >>>>> The situation and trend I understand about kexec_load and kexec_file_load
> >>>>> are:
> >>>>>
> >>>>> 1) Supporting kexec_file_load is suggested to add in ARCHes which don't
> >>>>> have yet, just as x86_64, arm64 and s390 have done;
> >>>>>  
> >>>>> 2) kexec_file_load is suggested to use, and take precedence over
> >>>>> kexec_load in the future, if both are supported in one ARCH.
> >>>>
> >>>> The deep problem is that kexec_file_load is distinctly less expressive
> >>>> than kexec_load.
> >>>>
> >>>>> 3) Kexec_load is kept being used by ARCHes w/o kexc_file_load support,
> >>>>> and by ARCHes for back compatibility w/ kexec_file_load support.
> >>>>>
> >>>>> For 1) and 2), I think the reason is obvious as Eric said,
> >>>>> kexec_file_load is simple enough. And currently, whenever we got a bug
> >>>>> report, we may need fix them twice, for kexec_load and kexec_file_load.
> >>>>> If kexec_file_load is made by default, e.g on x86_64, we will change it
> >>>>> in kernel space only, for kexec_file_load. This is what I meant about
> >>>>> 'obsolete gradually'. I think for arm64, s390, they will do these too.
> >>>>> Unless there's some critical/blocker bug in kexec_load, to corrupt the
> >>>>> old kexec_load interface in old product.
> >>>>
> >>>> Maybe.  The code that kexec_file_load sucked into the kernel is quite
> >>>> stable and rarely needs changes except during a port of kexec to
> >>>> another architecture.
> >>>>
> >>>> Last I looked the real maintenance effor of kexec and kexec on panic was
> >>>> in the drivers.  So I don't think we can use maintenance to do anything.
> >>>
> >>> Not sure if I got it. But if check Lianbo's patches, a lot of effort has
> >>> been taken to make SEV work well on kexec_file_load. And we have
> >>> switched to use kexec_file_load in the newly published  Fedora release
> >>> on x86_64 by default. Before this, Lianbo has investigated and done many
> >>> experiments to make sure the switching is safe. We finally made this
> >>> decision. Next we will do the switch in Enterprise distros. Once these
> >>> are proved safe, we will suggest customers to use kexec_file_load for
> >>> kexec rebooting too. In the future, we will only care about
> >>> kexec_file_load if everying is going well. But as I have explained
> >>> repeatedly, only caring about kexec_file_load means we will leave
> >>> kexec_load as is, we will not add new feature or improvement patches
> >>> for it.
> >>>
> >>> commit 6a20bd54473e11011bf2b47efb52d0759d412854
> >>> Author: Lianbo Jiang <lijiang@redhat.com>
> >>> Date:   Thu Jan 16 13:47:35 2020 +0800
> >>>
> >>>     kdump-lib: switch to the kexec_file_load() syscall on x86_64 by default
> >>>
> >>>>
> >>>>> For 3), people can still use kexec_load and develop/fix for it, if no
> >>>>> kexec_file_load supported. But 32-bit arm should be a different one,
> >>>>> more like i386, we will leave it as is, and fix anything which could
> >>>>> break it. But people really expects to improve or add feature to it? E.g
> >>>>> in this patchset, the mem hotplug issue James raised, I assume James is
> >>>>> focusing on arm64, x86_64, but not 32-bit arm. As DavidH commented in
> >>>>> another reply, people even don't agree to continue supporting memory
> >>>>> hotplug on 32-bit system. We ever took effort to fix a memory hotplug
> >>>>> bug on i386 with a patch, but people would rather set it as BROKEN.
> >>>>
> >>>> For memory hotplug just reload.  Userspace already gets good events.
> >>>
> >>> Kexec_file_load is easy to maintain. This is an example.
> >>>
> >>> Lock the hotplug area where kexed-ed kernel is targeted in this patchset,
> >>> it's obviously not right. We can't disable memory hotplug just because
> >>> kexec-ed kernel is loaded ahead of time. 
> >>>
> >>> Reloading is also not a good fix. Kexec-ed kernel is targeted at a
> >>> movable area, reloading can avoid kexec rebooting corruption if that
> >>> area is hot removed. But if that area is not removed, locating kernel
> >>> into the hotpluggable area will change the area into ummovable zone.
> >>> Unless we decide to not support memory hotplug in kexec-ed kernel, I
> >>> guess it's very hard. Now in our distros kexec rebooting has been
> >>> supported, the big cloud providers are deploying linux in guest, bugs on
> >>> kexec reboot failure has been reported. They need the memory hotplug to
> >>> increase/decrease memory.
> >>>
> >>> The root cause is kexec-ed kernel is targeted at hotpluggable memory
> >>> region. Just avoiding the movable area can fix it. In kexec_file_load(),
> >>> just checking or picking those unmovable region to put kernel/initrd in
> >>> function locate_mem_hole_callback() can fix it. The page or pageblock's
> >>> zone is movable or not, it's easy to know. This fix doesn't need to
> >>> bother other component.
> >>
> >> I don't fully agree. E.g., just because memory is onlined to ZONE_NORMAL
> >> does not imply that it cannot get offlined and removed e.g., this is
> >> heavily used on ppc64, with 16MB sections.
> > 
> > Really? I just know there are two kinds of mem hoplug in ppc, but don't
> > know the details. So in this case, is there any flag or a way to know
> > those memory block are hotpluggable? I am curious how those kernel data
> > is avoided to be put in this area. Or ppc just freely uses it for kernel
> > data or user space data, then try to migrate when hot remove?
> 
> See
> arch/powerpc/platforms/pseries/hotplug-memory.c:dlpar_memory_remove_by_count()
> 
> Under DLAPR, it can remove memory in LMB granularity, which is usually
> 16MB (== single section on ppc64). DLPAR will directly online all
> hotplugged memory (LMBs) from the kernel using device_online(), which
> will go to ZONE_NORMAL.
> 
> When trying to remove memory, it simply scans for offlineable 16MB
> memory blocks (==section == LMB), offlines and removes them. No need for
> the movable zone and all the involved issues.

Yes, this is a different one, thanks for pointing it out. It sounds like
a balloon driver on a virt platform, doesn't it?

Avoiding putting the kexec kernel into the movable zone can't solve this
DLPAR case, as you said.

> 
> Now, the interesting question is, can we have LMBs added during boot
> (not via add_memory()), that will later be removed via remove_memory().
> IIRC, we had BUGs related to that, so I think yes. If a section contains
> no unmovable allocations (after boot), it can get removed.

I do want to ask this question. If we can add LMBs into system RAM, then
reloading kexec can solve it.

Another, better way is to add a common function to filter out the
movable zone when searching for a position for the kexec kernel, and use
an arch-specific function to filter out DLPAR memory blocks for ppc only.
Over there, we can simply use for_each_drmem_lmb() to do that.
David Hildenbrand April 14, 2020, 2:49 p.m. UTC | #35
On 14.04.20 16:39, Baoquan He wrote:
> On 04/14/20 at 11:37am, David Hildenbrand wrote:
>> On 14.04.20 11:22, Baoquan He wrote:
>>> On 04/14/20 at 10:00am, David Hildenbrand wrote:
>>>> On 14.04.20 08:40, Baoquan He wrote:
>>>>> On 04/13/20 at 08:15am, Eric W. Biederman wrote:
>>>>>> Baoquan He <bhe@redhat.com> writes:
>>>>>>
>>>>>>> On 04/12/20 at 02:52pm, Eric W. Biederman wrote:
>>>>>>>>
>>>>>>>> The only benefit of kexec_file_load is that it is simple enough from a
>>>>>>>> kernel perspective that signatures can be checked.
>>>>>>>
>>>>>>> We don't have this restriction any more with below commit:
>>>>>>>
>>>>>>> commit 99d5cadfde2b ("kexec_file: split KEXEC_VERIFY_SIG into KEXEC_SIG
>>>>>>> and KEXEC_SIG_FORCE")
>>>>>>>
>>>>>>> With KEXEC_SIG_FORCE not set, we can use kexec_load_file to cover both
>>>>>>> secure boot or legacy system for kexec/kdump. Being simple enough is
>>>>>>> enough to astract and convince us to use it instead. And kexec_file_load
>>>>>>> has been in use for several years on systems with secure boot, since
>>>>>>> added in 2014, on x86_64.
>>>>>>
>>>>>> No.  Actaully kexec_file_load is the less capable interface, and less
>>>>>> flexible interface.  Which is why it is appropriate for signature
>>>>>> verification.
>>>>>
>>>>> Well, everyone has a stance and the corresponding view. You could have
>>>>> wider view from long time maintenance and in upstrem position, and think
>>>>> kexec_file_load is horrible. But I can only see from our work as a front
>>>>> line engineer to maintain/develop kexec/kdump in RHEL, and think
>>>>> kexec_file_load is easier to maintain.
>>>>>
>>>>> Surely except of multiple kernel image format support. No matter it is
>>>>> kexec_load and kexec_file_load, e.g in x86_64, we only support bzImage.
>>>>> This is produced from kerel building by default. We have no way to
>>>>> support it in our distros and add it into kexec_file_load.
>>>>>
>>>>> [RFC PATCH] x86/boot: make ELF kernel multiboot-able
>>>>> https://lkml.org/lkml/2017/2/15/654
>>>>>
>>>>>>
>>>>>>>> kexec_load in every other respect is the more capable and functional
>>>>>>>> interface.  It makes no sense to get rid of it.
>>>>>>>>
>>>>>>>> It does make sense to reload with a loaded kernel on memory hotplug.
>>>>>>>> That is simple and easy.  If we are going to handle something in the
>>>>>>>> kernel it should simple an automated unloading of the kernel on memory
>>>>>>>> hotplug.
>>>>>>>>
>>>>>>>>
>>>>>>>> I think it would be irresponsible to deprecate kexec_load on any
>>>>>>>> platform.
>>>>>>>>
>>>>>>>> I also suspect that kexec_file_load could be taught to copy the dtb
>>>>>>>> on arm32 if someone wants to deal with signatures.
>>>>>>>>
>>>>>>>> We definitely can not even think of deprecating kexec_load until
>>>>>>>> architecture that supports it also supports kexec_file_load and everyone
>>>>>>>> is happy with that interface.  That is Linus's no regression rule.
>>>>>>>
>>>>>>> I should pick a milder word to express our tendency and tell our plan
>>>>>>> then 'obsolete'. Even though I added 'gradually', seems it doesn't help
>>>>>>> much. I didn't mean to say 'deprecate' at all when replied.
>>>>>>>
>>>>>>> The situation and trend I understand about kexec_load and kexec_file_load
>>>>>>> are:
>>>>>>>
>>>>>>> 1) Supporting kexec_file_load is suggested to add in ARCHes which don't
>>>>>>> have yet, just as x86_64, arm64 and s390 have done;
>>>>>>>  
>>>>>>> 2) kexec_file_load is suggested to use, and take precedence over
>>>>>>> kexec_load in the future, if both are supported in one ARCH.
>>>>>>
>>>>>> The deep problem is that kexec_file_load is distinctly less expressive
>>>>>> than kexec_load.
>>>>>>
>>>>>>> 3) Kexec_load is kept being used by ARCHes w/o kexc_file_load support,
>>>>>>> and by ARCHes for back compatibility w/ kexec_file_load support.
>>>>>>>
>>>>>>> For 1) and 2), I think the reason is obvious as Eric said,
>>>>>>> kexec_file_load is simple enough. And currently, whenever we got a bug
>>>>>>> report, we may need fix them twice, for kexec_load and kexec_file_load.
>>>>>>> If kexec_file_load is made by default, e.g on x86_64, we will change it
>>>>>>> in kernel space only, for kexec_file_load. This is what I meant about
>>>>>>> 'obsolete gradually'. I think for arm64, s390, they will do these too.
>>>>>>> Unless there's some critical/blocker bug in kexec_load, to corrupt the
>>>>>>> old kexec_load interface in old product.
>>>>>>
>>>>>> Maybe.  The code that kexec_file_load sucked into the kernel is quite
>>>>>> stable and rarely needs changes except during a port of kexec to
>>>>>> another architecture.
>>>>>>
>>>>>> Last I looked the real maintenance effor of kexec and kexec on panic was
>>>>>> in the drivers.  So I don't think we can use maintenance to do anything.
>>>>>
>>>>> Not sure if I got it. But if check Lianbo's patches, a lot of effort has
>>>>> been taken to make SEV work well on kexec_file_load. And we have
>>>>> switched to use kexec_file_load in the newly published  Fedora release
>>>>> on x86_64 by default. Before this, Lianbo has investigated and done many
>>>>> experiments to make sure the switching is safe. We finally made this
>>>>> decision. Next we will do the switch in Enterprise distros. Once these
>>>>> are proved safe, we will suggest customers to use kexec_file_load for
>>>>> kexec rebooting too. In the future, we will only care about
>>>>> kexec_file_load if everying is going well. But as I have explained
>>>>> repeatedly, only caring about kexec_file_load means we will leave
>>>>> kexec_load as is, we will not add new feature or improvement patches
>>>>> for it.
>>>>>
>>>>> commit 6a20bd54473e11011bf2b47efb52d0759d412854
>>>>> Author: Lianbo Jiang <lijiang@redhat.com>
>>>>> Date:   Thu Jan 16 13:47:35 2020 +0800
>>>>>
>>>>>     kdump-lib: switch to the kexec_file_load() syscall on x86_64 by default
>>>>>
>>>>>>
>>>>>>> For 3), people can still use kexec_load and develop/fix for it, if no
>>>>>>> kexec_file_load supported. But 32-bit arm should be a different one,
>>>>>>> more like i386, we will leave it as is, and fix anything which could
>>>>>>> break it. But people really expects to improve or add feature to it? E.g
>>>>>>> in this patchset, the mem hotplug issue James raised, I assume James is
>>>>>>> focusing on arm64, x86_64, but not 32-bit arm. As DavidH commented in
>>>>>>> another reply, people even don't agree to continue supporting memory
>>>>>>> hotplug on 32-bit system. We ever took effort to fix a memory hotplug
>>>>>>> bug on i386 with a patch, but people would rather set it as BROKEN.
>>>>>>
>>>>>> For memory hotplug just reload.  Userspace already gets good events.
>>>>>
>>>>> Kexec_file_load is easy to maintain. This is an example.
>>>>>
>>>>> Lock the hotplug area where kexed-ed kernel is targeted in this patchset,
>>>>> it's obviously not right. We can't disable memory hotplug just because
>>>>> kexec-ed kernel is loaded ahead of time. 
>>>>>
>>>>> Reloading is also not a good fix. Kexec-ed kernel is targeted at a
>>>>> movable area, reloading can avoid kexec rebooting corruption if that
>>>>> area is hot removed. But if that area is not removed, locating kernel
>>>>> into the hotpluggable area will change the area into ummovable zone.
>>>>> Unless we decide to not support memory hotplug in kexec-ed kernel, I
>>>>> guess it's very hard. Now in our distros kexec rebooting has been
>>>>> supported, the big cloud providers are deploying linux in guest, bugs on
>>>>> kexec reboot failure has been reported. They need the memory hotplug to
>>>>> increase/decrease memory.
>>>>>
>>>>> The root cause is kexec-ed kernel is targeted at hotpluggable memory
>>>>> region. Just avoiding the movable area can fix it. In kexec_file_load(),
>>>>> just checking or picking those unmovable region to put kernel/initrd in
>>>>> function locate_mem_hole_callback() can fix it. The page or pageblock's
>>>>> zone is movable or not, it's easy to know. This fix doesn't need to
>>>>> bother other component.
>>>>
>>>> I don't fully agree. E.g., just because memory is onlined to ZONE_NORMAL
>>>> does not imply that it cannot get offlined and removed e.g., this is
>>>> heavily used on ppc64, with 16MB sections.
>>>
>>> Really? I just know there are two kinds of mem hoplug in ppc, but don't
>>> know the details. So in this case, is there any flag or a way to know
>>> those memory block are hotpluggable? I am curious how those kernel data
>>> is avoided to be put in this area. Or ppc just freely uses it for kernel
>>> data or user space data, then try to migrate when hot remove?
>>
>> See
>> arch/powerpc/platforms/pseries/hotplug-memory.c:dlpar_memory_remove_by_count()
>>
>> Under DLAPR, it can remove memory in LMB granularity, which is usually
>> 16MB (== single section on ppc64). DLPAR will directly online all
>> hotplugged memory (LMBs) from the kernel using device_online(), which
>> will go to ZONE_NORMAL.
>>
>> When trying to remove memory, it simply scans for offlineable 16MB
>> memory blocks (==section == LMB), offlines and removes them. No need for
>> the movable zone and all the involved issues.
> 
> Yes, this is a different one, thanks for pointing it out. It sounds like
> balloon driver in virt platform, doesn't it?

With DLPAR there is a hypervisor involved (which manages the actual HW
DIMMs), so yes.

> 
> Avoiding to put kexec kernel into movable zone can't solve this DLPAR
> case as you said.
> 
>>
>> Now, the interesting question is, can we have LMBs added during boot
>> (not via add_memory()), that will later be removed via remove_memory().
>> IIRC, we had BUGs related to that, so I think yes. If a section contains
>> no unmovable allocations (after boot), it can get removed.
> 
> I do want to ask this question. If we can add LMB into system RAM, then
> reload kexec can solve it. 
> 
> Another better way is adding a common function to filter out the
> movable zone when search position for kexec kernel, use an arch specific
> function to filter out DLPAR memory blocks for ppc only. Over there,
> we can simply use for_each_drmem_lmb() to do that.

I was thinking about something similar. Maybe something like a notifier
that can be used to test if selected memory can be used for kexec
images. It would apply to

- arm64 and filter out all hotadded memory (IIRC, only boot memory can
  be used).
- powerpc to filter out all LMBs that can be removed (assuming not all
  memory corresponds to LMBs that can be removed, otherwise we're in
  trouble ... :) )
- virtio-mem to filter out all memory it added.
- hyper-v to filter out partially backed memory blocks (esp. the last
  memory block it added and only partially backed it by memory).

This would make it work for kexec_file_load(), however, I do wonder how
we would want to approach that from userspace kexec-tools when handling
it from kexec_load().
James Morse April 14, 2020, 4:55 p.m. UTC | #36
Hi guys,

On 14/04/2020 08:05, David Hildenbrand wrote:
> On 10.04.20 21:10, Andrew Morton wrote:
>> It's unclear (to me) what is the status of this patchset.  But it does appear that
>> an new version can be expected?

> I'd suggest to unqueue the patches until we have a consensus.

Certainly!


> While there are a couple of ideas floating around here, my current
> suggestion would be either
> 
> 1. Indicate all hotplugged memory as "System RAM (hotplugged)" in
> /proc/iomem and the firmware memmap (on all architectures). This will
> require kexec changes,

> but I would have assume that kexec has to be
> updated in lock-step with the kernel

News to me: I was using the version I first built when arm64's support was new. I've only
had to update it once when we had to change user-space.

I don't think debian updates kexec-tools when it updates the kernel.


Making changes to /proc/iomem means updating user-space again, (for kdump). I'd like to
avoid that if it's at all possible.


> just like e.g., makedumpfile.
> Modify kexec() to not place the kexec kernel on these areas (easy) but
> still consider them as crash regions to dump. When loading a kexec
> kernel, validate in the kernel that the memory is appropriate.


> 2. Make kexec() reload the kernel whenever we e.g., get a udev event
> for removal of memory in /sys/devices/system/memory/.

I don't think we can rely on user-space to do something,


> On every remove_memory(), invalidate the loaded kernel in the kernel.

This is an option, ... but it's a change of behaviour. If user-space asks for two
impossible things, the second request should fail. Having the first-one disappear is a bit
spooky...

Fortunately user-space checks the 'kexec -l' bit happened before it calls reboot() behind
'kexec -e'. So this works, but is not intuitive.

("Did I load it? What changed and when? oh, half a mile up in dmesg is a message saying
the kernel discarded the kexec kernel last wednesday.")


> As I mentioned somewhere, 1. will be interesting for virtio-mem, where
> we don't want any kexec kernel to be placed on virtio-mem-added memory.

Do these virtio-mem-added regions need to be accessible by kdump?
(do we already need a user-space change for that?)


A third option, along the line of what I posted:

Split the 'offline' and 'removed' ideas, which David mentioned somewhere. We'd end up with
(yet) another notifier chain, that prevents the memory being removed, but you can still
mark it as offline in /sys/. (...I'm not quite sure why you would do that...)

This would need hooking up for ACPI (which covers x86 and arm64), and other architectures
mechanisms for doing this...
arm64 can then switch its arch hook that prevents 'bootmem' being removed to this new
notifier chain, as the kernel can only boot from memory that was present at boot.


My preference is 3, then 2.

I think 1 is slightly less desirable than a message at kexec time that the memory layout
has changed since load, and this might not work...



Thanks,

James
David Hildenbrand April 14, 2020, 5:41 p.m. UTC | #37
>> While there are a couple of ideas floating around here, my current
>> suggestion would be either
>>
>> 1. Indicate all hotplugged memory as "System RAM (hotplugged)" in
>> /proc/iomem and the firmware memmap (on all architectures). This will
>> require kexec changes,
> 
>> but I would have assume that kexec has to be
>> updated in lock-step with the kernel
> 
> News to me: I was using the version I first built when arm64's support was new. I've only
> had to update it once when we had to change user-space.
> 
> I don't think debian updates kexec-tools when it updates the kernel.

I would assume they are also not pushing the latest-greatest kernel in
their current release, after settling on a kexec version, no?

I think you can assume new kernels to require new kexec-tools versions
to provide all features.

> Making changes to /proc/iomem means updating user-space again, (for kdump). I'd like to
> avoid that if its at all possible.

Yes, it's not desirable, but if all that's not working is a "not all
memory will be dumped out of the box", at least I think this is
tolerable. It's not like we're completely breaking kexec.

Your current arm64 patches require the same change AFAIKS - and I think
we already have arm64 hotplug support in Linux distros.

As I said, similarly, makedumpfile has to be upgraded with every kernel
release to make kdump work as expected. And that is no big news I hope :)

>> just like e.g., makedumpfile.
>> Modify kexec() to not place the kexec kernel on these areas (easy) but
>> still consider them as crash regions to dump. When loading a kexec
>> kernel, validate in the kernel that the memory is appropriate.
> 
> 
>> 2. Make kexec() reload the the kernel whenever we e.g., get a udev event
>> for removal of memory in /sys/devices/system/memory/.
> 
> I don't think we can rely on user-space to do something,
> 
> 
>> On every remove_memory(), invalidate the loaded kernel in the kernel.
> 
> This is an option, ... but its a change of behaviour. If user-space asks for two
> impossible things, the second request should fail. Having the first-one disappear is a bit
> spooky...

We are talking about corner cases that are already broken, no?

> 
> Fortunately user-space checks the 'kexec -l' bit happened before it calls reboot() behind
> 'kexec -e'. So this works, but is not intuitive.
> 
> ("Did I load it? What changed and when? oh, half a mile up in dmesg is a message saying
> the kernel discarded the kexec kernel last wednesday.")
> 
> 
>> As I mentioned somewhere, 1. will be interesting for virtio-mem, where
>> we don't want any kexec kernel to be placed on virtio-mem-added memory.
> 
> Do these virtio-mem-added regions need to be accessible by kdump?
> (do we already need a user-space change for that?)

Yes, they have to be accessible by kdump. Currently, they are also
exported as "System RAM" via /proc/iomem - which is why dumping works
e.g., on x86-64 (we'll have to increase the #of memory resources that
can be considered in the future, but that's a different story and only
applies when adding more than 100GB of memory via virtio-mem or so)

But as virtio-mem is fairly new (IOW, about to get queued for
integration soonish), I could still change the memory resources to show
up differently ("System RAM (hotplugged)", "System RAM (virtio-mem)",
etc.) and teach kexec about them.

But learning that we are having similar problems on arm64 (and
theoretically on Hyper-V), I think it makes sense to discuss a solution
that will solve the other issues as well.

> 
> 
> A third option, along the line of what I posted:
> 
> Split the 'offline' and 'removed' ideas, which David mentioned somewhere. We'd end up with
> (yet) another notifier chain, that prevents the memory being removed, but you can still

I dislike limiting memory unplug - and especially making remove_memory()
fail - just because somebody once thought it would be a good place to
load - in the future - some kexec binary onto it.

> mark it as offline in /sys/. (...I'm not quite sure why you would do that...)
> 
> This would need hooking up for ACPI (which covers x86 and arm64), and other architectures
> mechanisms for doing this...
> arm64 can then switch is arch hook that prevents 'bootmem' being removed to this new
> notifier chain, as the kernel can only boot from that was present at boot.


We have two different problems here, right?

1. Don't place kexec binaries on specific memory areas (e.g., arm64,
virtio-mem, hyper-v, ...)

2. Figure out what to do when unplugging memory that was selected as a
target for kexec binaries.


For 1, I have a feeling that /proc/iomem could be the right solution,
eventually requiring kexec changes to handle kdump properly (IOW, dump
all memory). Indicating all hotplugged memory as "System RAM
(hotplugged)" would be the way to go here.

For 2, I think we should unload all kexec images in case they overlap
with memory to be removed (e.g., remove_memory() notifier, which cannot
stop removal, it's only an indication), and make userspace reload kexec
via udev events.


Also, we have to think about kexec_file_load() to deal with 1.
Baoquan He April 15, 2020, 2:35 a.m. UTC | #38
On 04/14/20 at 04:49pm, David Hildenbrand wrote:
> >>>>> The root cause is kexec-ed kernel is targeted at hotpluggable memory
> >>>>> region. Just avoiding the movable area can fix it. In kexec_file_load(),
> >>>>> just checking or picking those unmovable region to put kernel/initrd in
> >>>>> function locate_mem_hole_callback() can fix it. The page or pageblock's
> >>>>> zone is movable or not, it's easy to know. This fix doesn't need to
> >>>>> bother other component.
> >>>>
> >>>> I don't fully agree. E.g., just because memory is onlined to ZONE_NORMAL
> >>>> does not imply that it cannot get offlined and removed e.g., this is
> >>>> heavily used on ppc64, with 16MB sections.
> >>>
> >>> Really? I just know there are two kinds of mem hoplug in ppc, but don't
> >>> know the details. So in this case, is there any flag or a way to know
> >>> those memory block are hotpluggable? I am curious how those kernel data
> >>> is avoided to be put in this area. Or ppc just freely uses it for kernel
> >>> data or user space data, then try to migrate when hot remove?
> >>
> >> See
> >> arch/powerpc/platforms/pseries/hotplug-memory.c:dlpar_memory_remove_by_count()
> >>
> >> Under DLAPR, it can remove memory in LMB granularity, which is usually
> >> 16MB (== single section on ppc64). DLPAR will directly online all
> >> hotplugged memory (LMBs) from the kernel using device_online(), which
> >> will go to ZONE_NORMAL.
> >>
> >> When trying to remove memory, it simply scans for offlineable 16MB
> >> memory blocks (==section == LMB), offlines and removes them. No need for
> >> the movable zone and all the involved issues.
> > 
> > Yes, this is a different one, thanks for pointing it out. It sounds like
> > balloon driver in virt platform, doesn't it?
> 
> With DLPAR there is a hypervisor involved (which manages the actual HW
> DIMMs), so yes.
> 
> > 
> > Avoiding to put kexec kernel into movable zone can't solve this DLPAR
> > case as you said.
> > 
> >>
> >> Now, the interesting question is, can we have LMBs added during boot
> >> (not via add_memory()), that will later be removed via remove_memory().
> >> IIRC, we had BUGs related to that, so I think yes. If a section contains
> >> no unmovable allocations (after boot), it can get removed.
> > 
> > I do want to ask this question. If we can add LMB into system RAM, then
> > reload kexec can solve it. 
> > 
> > Another better way is adding a common function to filter out the
> > movable zone when search position for kexec kernel, use a arch specific
> > funciton to filter out DLPAR memory blocks for ppc only. Over there,
> > we can simply use for_each_drmem_lmb() to do that.
> 
> I was thinking about something similar. Maybe something like a notifier
> that can be used to test if selected memory can be used for kexec

Not sure if I get the notifier idea clearly. If you mean 

1) Add a common function to pick memory in unmovable zone;
2) Let DLPAR, balloon register with notifier;
3) In the common function, ask notified part to check if the picked
   unmovable memory is available for locating kexec kernel;

Sounds doable to me, and not complicated.

> images. It would apply to
> 
> - arm64 and filter out all hotadded memory (IIRC, only boot memory can
>   be used).

Do you mean hot added memory after boot can't be recognized and added
into system RAM on arm64?


> - powerpc to filter out all LMBs that can be removed (assuming not all
>   memory corresponds to LMBs that can be removed, otherwise we're in
>   trouble ... :) )
> - virtio-mem to filter out all memory it added.
> - hyper-v to filter out partially backed memory blocks (esp. the last
>   memory block it added and only partially backed it by memory).
> 
> This would make it work for kexec_file_load(), however, I do wonder how
> we would want to approach that from userspace kexec-tools when handling
> it from kexec_load().

Let's make kexec_file_load work firstly. Since this work is only first
step to make kexec-ed kernel not break memory hotplug. After kexec
rebooting, the KASLR may locate kernel into hotpluggable area too.
Eric W. Biederman April 15, 2020, 8:33 p.m. UTC | #39
James Morse <james.morse@arm.com> writes:

> An image loaded for kexec is not stored in place, instead its segments
> are scattered through memory, and are re-assembled when needed. In the
> meantime, the target memory may have been removed.
>
> Because mm is not aware that this memory is still in use, it allows it
> to be removed.
>
> Add a memory notifier to prevent the removal of memory regions that
> overlap with a loaded kexec image segment. e.g., when triggered from the
> Qemu console:
> | kexec_core: memory region in use
> | memory memory32: Offline failed.
>
> Signed-off-by: James Morse <james.morse@arm.com>

Given that we are talking about the destination pages for kexec
not where the loaded kernel is currently stored the description is
confusing.

Beyond that I think it would be better to simply unload the loaded
kernel at memory hotunplug time.  Usually somewhere in the loaded image
is a copy of the memory map at the time the kexec kernel was loaded.
That will invalidate the memory map as well.

All of this should be for a very brief window of a few seconds, as
the loaded kexec image is quite short.

So instead of failing in the notifier, if you could simply unload the
loaded image in the notifier I think that would be simpler and more
robust.  While still preventing the loaded image from falling over
when it starts executing.

Eric
David Hildenbrand April 16, 2020, 1:31 p.m. UTC | #40
> Not sure if I get the notifier idea clearly. If you mean 
> 
> 1) Add a common function to pick memory in unmovable zone;

Not strictly required IMHO. But, minor detail.

> 2) Let DLPAR, balloon register with notifier;

Yeah, or virtio-mem, or any other technology that adds/removes memory
dynamically.

> 3) In the common function, ask notified part to check if the picked
>    unmovable memory is available for locating kexec kernel;

Yeah.

> 
> Sounds doable to me, and not complicated.
> 
>> images. It would apply to
>>
>> - arm64 and filter out all hotadded memory (IIRC, only boot memory can
>>   be used).
> 
> Do you mean hot added memory after boot can't be recognized and added
> into system RAM on arm64?

See patch #3 of this patch set, which wants to avoid placing kexec
binaries on hotplugged memory. But I have no idea what the current plan
regarding arm64 is (this thread exploded :) ).

I would assume that we don't want to place kexec images on any
hotplugged (or rather: hot(un)pluggable) memory - on any architecture.

> 
> 
>> - powerpc to filter out all LMBs that can be removed (assuming not all
>>   memory corresponds to LMBs that can be removed, otherwise we're in
>>   trouble ... :) )
>> - virtio-mem to filter out all memory it added.
>> - hyper-v to filter out partially backed memory blocks (esp. the last
>>   memory block it added and only partially backed it by memory).
>>
>> This would make it work for kexec_file_load(), however, I do wonder how
>> we would want to approach that from userspace kexec-tools when handling
>> it from kexec_load().
> 
> Let's make kexec_file_load work firstly. Since this work is only first
> step to make kexec-ed kernel not break memory hotplug. After kexec
> rebooting, the KASLR may locate kernel into hotpluggable area too.

Can you elaborate how that would work?
Baoquan He April 16, 2020, 2:02 p.m. UTC | #41
On 04/16/20 at 03:31pm, David Hildenbrand wrote:
> > Not sure if I get the notifier idea clearly. If you mean 
> > 
> > 1) Add a common function to pick memory in unmovable zone;
> 
> Not strictly required IMHO. But, minor detail.
> 
> > 2) Let DLPAR, balloon register with notifier;
> 
> Yeah, or virtio-mem, or any other technology that adds/removes memory
> dynamically.
> 
> > 3) In the common function, ask notified part to check if the picked
> >    unmovable memory is available for locating kexec kernel;
> 
> Yeah.

These may not be needed, please see below comment.

> 
> > 
> > Sounds doable to me, and not complicated.
> > 
> >> images. It would apply to
> >>
> >> - arm64 and filter out all hotadded memory (IIRC, only boot memory can
> >>   be used).
> > 
> > Do you mean hot added memory after boot can't be recognized and added
> > into system RAM on arm64?
> 
> See patch #3 of this patch set, which wants to avoid placing kexec
> binaries on hotplugged memory. But I have no idea what the current plan
> regarding arm64 is (this thread exploded :) ).
> 
> I would assume that we don't want to place kexec images on any
> hotplugged (or rather: hot(un)pluggable) memory - on any architecture.

Yes, noticed that and James replied to DaveY.

Later, when I was considering to make a draft patch to do the picking of
memory from normal zone, and add a notifier, as we discussed at above, I
suddenly realized that kexec_file_load doesn't have this issue. It
traverses system RAM bottom up to get an available region to put
kernel/initrd/boot_param, etc. I can't think of a system where its
low memory could be unavailable.
> 
> > 
> > 
> >> - powerpc to filter out all LMBs that can be removed (assuming not all
> >>   memory corresponds to LMBs that can be removed, otherwise we're in
> >>   trouble ... :) )
> >> - virtio-mem to filter out all memory it added.
> >> - hyper-v to filter out partially backed memory blocks (esp. the last
> >>   memory block it added and only partially backed it by memory).
> >>
> >> This would make it work for kexec_file_load(), however, I do wonder how
> >> we would want to approach that from userspace kexec-tools when handling
> >> it from kexec_load().
> > 
> > Let's make kexec_file_load work firstly. Since this work is only first
> > step to make kexec-ed kernel not break memory hotplug. After kexec
> > rebooting, the KASLR may locate kernel into hotpluggable area too.
> 
> Can you elaborate how that would work?

Well, boot memory can be hotplugged or not after boot, they are marked
in uefi tables, the current kexec doesn't save and pass them into 2nd
kernel, when the kexec kernel boots up, it needs to read them and avoid
randomizing the kernel into them.
David Hildenbrand April 16, 2020, 2:09 p.m. UTC | #42
>>> Sounds doable to me, and not complicated.
>>>
>>>> images. It would apply to
>>>>
>>>> - arm64 and filter out all hotadded memory (IIRC, only boot memory can
>>>>   be used).
>>>
>>> Do you mean hot added memory after boot can't be recognized and added
>>> into system RAM on arm64?
>>
>> See patch #3 of this patch set, which wants to avoid placing kexec
>> binaries on hotplugged memory. But I have no idea what the current plan
>> regarding arm64 is (this thread exploded :) ).
>>
>> I would assume that we don't want to place kexec images on any
>> hotplugged (or rather: hot(un)pluggable) memory - on any architecture.
> 
> Yes, noticed that and James replied to DaveY.
> 
> Later, when I was considering to make a draft patch to do the picking of
> memory from normal zone, and add a notifier, as we discussed at above, I
> suddenly realized that kexec_file_load doesn't have this issue. It
> traverse system RAM bottom up to get an available region to put
> kernel/initrd/boot_param, etc. I can't think of a system where its
> low memory could be unavailable.

kexec_walk_memblock() has the option for "kbuf->top_down". Only
kexec_walk_resources() seems to ignore it.

So I think in case of memblocks (e.g., arm64), this still applies?

>>
>>>
>>>
>>>> - powerpc to filter out all LMBs that can be removed (assuming not all
>>>>   memory corresponds to LMBs that can be removed, otherwise we're in
>>>>   trouble ... :) )
>>>> - virtio-mem to filter out all memory it added.
>>>> - hyper-v to filter out partially backed memory blocks (esp. the last
>>>>   memory block it added and only partially backed it by memory).
>>>>
>>>> This would make it work for kexec_file_load(), however, I do wonder how
>>>> we would want to approach that from userspace kexec-tools when handling
>>>> it from kexec_load().
>>>
>>> Let's make kexec_file_load work firstly. Since this work is only first
>>> step to make kexec-ed kernel not break memory hotplug. After kexec
>>> rebooting, the KASLR may locate kernel into hotpluggable area too.
>>
>> Can you elaborate how that would work?
> 
> Well, boot memory can be hotplugged or not after boot, they are marked
> in uefi tables, the current kexec doesn't save and pass them into 2nd
> kenrel, when kexec kernel bootup, it need read them and avoid them to
> randomize kernel into.

What about e.g., memory hotplugged by ACPI? I would assume, that the
kexec kernel will not make use of that (IOW detected that) until the
ACPI driver comes up and re-detects + adds that memory.

Or how would that machinery work in case we have a DIMM hotplugged via ACPI?
Baoquan He April 16, 2020, 2:36 p.m. UTC | #43
On 04/16/20 at 04:09pm, David Hildenbrand wrote:
> >>> Sounds doable to me, and not complicated.
> >>>
> >>>> images. It would apply to
> >>>>
> >>>> - arm64 and filter out all hotadded memory (IIRC, only boot memory can
> >>>>   be used).
> >>>
> >>> Do you mean hot added memory after boot can't be recognized and added
> >>> into system RAM on arm64?
> >>
> >> See patch #3 of this patch set, which wants to avoid placing kexec
> >> binaries on hotplugged memory. But I have no idea what the current plan
> >> regarding arm64 is (this thread exploded :) ).
> >>
> >> I would assume that we don't want to place kexec images on any
> >> hotplugged (or rather: hot(un)pluggable) memory - on any architecture.
> > 
> > Yes, noticed that and James replied to DaveY.
> > 
> > Later, when I was considering to make a draft patch to do the picking of
> > memory from normal zone, and add a notifier, as we discussed at above, I
> > suddenly realized that kexec_file_load doesn't have this issue. It
> > traverse system RAM bottom up to get an available region to put
> > kernel/initrd/boot_param, etc. I can't think of a system where its
> > low memory could be unavailable.
> 
> kexec_walk_memblock() has the option for "kbuf->top_down". Only
> kexec_walk_resources() seems to ignore it.

Yeah, that top down searching is done in a found low mem area. Means
firstly search an available region bottom up, then put kernel top down
in that region. The reason is our iomem res is linked with singly linked
list. So we can only search bottom up efficiently.

kexec_load is doing the real top down searching, so kernel will be put
at the top of system ram. I ever tried to change it to support top down
searching for kexec_file_load too with patches, since QE and customers
are often confused with this difference when debugging.

Andrew may remember this, he suggested me to change the singly linked list 
to doubly linked list for iomem res, then do the top down searching for
kexec_file_load. I tried with some effort, the change introduced too much
code change, I just gave up finally.

http://archive.lwn.net:8080/devicetree/20180718024944.577-1-bhe@redhat.com/

I can see that top down searching for kexec can avoid the highly used
low memory region, esp under 4G, for dma, kinds of firmware reserving,
etc. And customers/QE of kexec get used to it. I can change kexec_file_load
to top down too with a simple way if people really complain it. But now, 
seems bottom up is not bad too.

> 
> So I think in case of memblocks (e.g., arm64), this still applies?

Yeah, aren't you trying to remove it? I haven't read your patches
carefully, maybe I got it wrong. And arm64 even can't support the hot added
memory being able to be recorded into firmware, seems it's not so ready, 
won't they change that design in the future?
> 
> >>
> >>>
> >>>
> >>>> - powerpc to filter out all LMBs that can be removed (assuming not all
> >>>>   memory corresponds to LMBs that can be removed, otherwise we're in
> >>>>   trouble ... :) )
> >>>> - virtio-mem to filter out all memory it added.
> >>>> - hyper-v to filter out partially backed memory blocks (esp. the last
> >>>>   memory block it added and only partially backed it by memory).
> >>>>
> >>>> This would make it work for kexec_file_load(), however, I do wonder how
> >>>> we would want to approach that from userspace kexec-tools when handling
> >>>> it from kexec_load().
> >>>
> >>> Let's make kexec_file_load work firstly. Since this work is only first
> >>> step to make kexec-ed kernel not break memory hotplug. After kexec
> >>> rebooting, the KASLR may locate kernel into hotpluggable area too.
> >>
> >> Can you elaborate how that would work?
> > 
> > Well, boot memory can be hotplugged or not after boot, they are marked
> > in uefi tables, the current kexec doesn't save and pass them into 2nd
> > kenrel, when kexec kernel bootup, it need read them and avoid them to
> > randomize kernel into.
> 
> What about e.g., memory hotplugged by ACPI? I would assume, that the
> kexec kernel will not make use of that (IOW detected that) until the
> ACPI driver comes up and re-detects + adds that memory.
> 
> Or how would that machinery work in case we have a DIMM hotplugged via ACPI?

ACPI SRAT is embedded into efi, need read out the rsdp pointer. If we don't
pass the efi, it won't get the SRAT table correctly, if I remember
correctly. Yeah, I remember kvm guest can get memory hotplugged with
ACPI only, this won't happen on bare metal though. Need check carefully. 
I have been using kvm guest with uefi firmware recently.
David Hildenbrand April 16, 2020, 2:47 p.m. UTC | #44
>> kexec_walk_memblock() has the option for "kbuf->top_down". Only
>> kexec_walk_resources() seems to ignore it.
> 
> Yeah, that top down searching is done in a found low mem area. Means
> firstly search an available region bottom up, then put kernel top down
> in that region. The reason is our iomem res is linked with singly linked
> list. So we can only search bottom up efficiently.
> 
> kexec_load is doing the real top down searching, so kernel will be put
> at the top of system ram. I ever tried to change it to support top down
> searching for kexec_file_load too with patches, since QE and customers
> are often confused with this difference when debugging.
> 
> Andrew may remeber this, he suggested me to change the singly linked list 
> to doubly linked list for iomem res, then do the top down searching for
> kexec_file_load. I tried with some effort, the change introduced too much
> code change, I just gave up finally.

Well, at least right now this seems to be the right approach (hotplug),
lol :)

> 
> http://archive.lwn.net:8080/devicetree/20180718024944.577-1-bhe@redhat.com/
> 
> I can see that top down searching for kexec can avoid the highly used
> low memory region, esp under 4G, for dma, kinds of firmware reserving,
> etc. And customers/QE of kexec get used to it. I can change kexec_file_load
> to top down too with a simple way if people really complain it. But now, 
> seems bottom up is not bad too.

Ah, I understand the problem. Maybe a simple "optimization" would be to
start searching bottom-up from e.g.,2GB/4GB first. If nothing was found,
search bottom-up from 0-2GB/4GB etc.

> 
>>
>> So I think in case of memblocks (e.g., arm64), this still applies?
> 
> Yeah, aren't you trying to remove it? I haven't read your patches
> carefully, maybe I got it wrong. And arm64 even can't support the hot added

For arm64 we're still creating memblocks for hotplugged memory, but I
guess it's not too hard to stop doing that.

> memory being able to recorded into firmware, seems it's not so ready, 
> won't they change that design in the future?

It seems to be incomplete, yes. No idea if it's fixable, no arm64 expert ...


>>>>>> - powerpc to filter out all LMBs that can be removed (assuming not all
>>>>>>   memory corresponds to LMBs that can be removed, otherwise we're in
>>>>>>   trouble ... :) )
>>>>>> - virtio-mem to filter out all memory it added.
>>>>>> - hyper-v to filter out partially backed memory blocks (esp. the last
>>>>>>   memory block it added and only partially backed it by memory).
>>>>>>
>>>>>> This would make it work for kexec_file_load(), however, I do wonder how
>>>>>> we would want to approach that from userspace kexec-tools when handling
>>>>>> it from kexec_load().
>>>>>
>>>>> Let's make kexec_file_load work firstly. Since this work is only first
>>>>> step to make kexec-ed kernel not break memory hotplug. After kexec
>>>>> rebooting, the KASLR may locate kernel into hotpluggable area too.
>>>>
>>>> Can you elaborate how that would work?
>>>
>>> Well, boot memory can be hotplugged or not after boot, they are marked
>>> in uefi tables, the current kexec doesn't save and pass them into 2nd
>>> kenrel, when kexec kernel bootup, it need read them and avoid them to
>>> randomize kernel into.
>>
>> What about e.g., memory hotplugged by ACPI? I would assume, that the
>> kexec kernel will not make use of that (IOW detected that) until the
>> ACPI driver comes up and re-detects + adds that memory.
>>
>> Or how would that machinery work in case we have a DIMM hotplugged via ACPI?
> 
> ACPI SRAT is embeded into efi, need read out the rsdp pointer. If we don't
> pass the efi, it won't get the SRAT table correctly, if I remember
> correctly. Yeah, I remeber kvm guest can get memory hotplugged with
> ACPI only, this won't happen on bare metal though. Need check carefully. 
> I have been using kvm guest with uefi firmwire recently.

Yeah, I can imagine that bare metal is different. kvm only uses ACPI.

I'm also asking because of virtio-mem. Memory added via virtio-mem is
not part of any efi tables or whatsoever. So I assume the kexec kernel
will not detect it automatically (good!), instead load the virtio-mem
driver and let it add memory back to the system.

I should probably play with kexec and virtio-mem once I have some spare
cycles ... to find out what's broken and needs to be addressed :)
David Hildenbrand April 21, 2020, 1:29 p.m. UTC | #45
>> ACPI SRAT is embeded into efi, need read out the rsdp pointer. If we don't
>> pass the efi, it won't get the SRAT table correctly, if I remember
>> correctly. Yeah, I remeber kvm guest can get memory hotplugged with
>> ACPI only, this won't happen on bare metal though. Need check carefully. 
>> I have been using kvm guest with uefi firmwire recently.
> 
> Yeah, I can imagine that bare metal is different. kvm only uses ACPI.
> 
> I'm also asking because of virtio-mem. Memory added via virtio-mem is
> not part of any efi tables or whatsoever. So I assume the kexec kernel
> will not detect it automatically (good!), instead load the virtio-mem
> driver and let it add memory back to the system.
> 
> I should probably play with kexec and virtio-mem once I have some spare
> cycles ... to find out what's broken and needs to be addressed :)

FWIW, I just gave virtio-mem and kexec/kdump a try.

a) kdump seems to work. Memory added by virtio-mem is getting dumped.
The kexec kernel only uses memory in the crash region. The virtio-mem
driver properly bails out due to is_kdump_kernel().

b) "kexec -s -l" seems to work fine. For now, the kernel does not seem
to get placed on virtio-mem memory (pure luck due to the left-to-right
search). Memory added by virtio-mem is not getting added to the e820
map. Once the virtio-mem driver comes back up in the kexec kernel, the
right memory is readded.

c) "kexec -c -l" does not work properly. All memory added by virtio-mem
is added to the e820 map, which is wrong. Memory that should not be
touched will be touched by the kexec kernel. I assume kexec-tools just
goes ahead and adds anything it can find in /proc/iomem (or
/sys/firmware/memmap/) to the e820 map of the new kernel.

Due to c), I assume all hotplugged memory (e.g., ACPI DIMMs) is
similarly added to the e820 map and, therefore, won't be able to be
onlined MOVABLE easily.


At least for virtio-mem, I would either have to
a) Not support "kexec -c -l". A viable option if we would be planning on
not supporting it either way in the long term. I could block this
in-kernel somehow eventually.

b) Teach kexec-tools to leave virtio-mem added memory alone. E.g., by
indicating it in /proc/iomem in a special way ("System RAM
(hotplugged)"/"System RAM (virtio-mem)").

Baoquan, any opinion on that?
David Hildenbrand April 21, 2020, 1:57 p.m. UTC | #46
On 21.04.20 15:29, David Hildenbrand wrote:
>>> ACPI SRAT is embeded into efi, need read out the rsdp pointer. If we don't
>>> pass the efi, it won't get the SRAT table correctly, if I remember
>>> correctly. Yeah, I remeber kvm guest can get memory hotplugged with
>>> ACPI only, this won't happen on bare metal though. Need check carefully. 
>>> I have been using kvm guest with uefi firmwire recently.
>>
>> Yeah, I can imagine that bare metal is different. kvm only uses ACPI.
>>
>> I'm also asking because of virtio-mem. Memory added via virtio-mem is
>> not part of any efi tables or whatsoever. So I assume the kexec kernel
>> will not detect it automatically (good!), instead load the virtio-mem
>> driver and let it add memory back to the system.
>>
>> I should probably play with kexec and virtio-mem once I have some spare
>> cycles ... to find out what's broken and needs to be addressed :)
> 
> FWIW, I just gave virtio-mem and kexec/kdump a try.
> 
> a) kdump seems to work. Memory added by virtio-mem is getting dumped.
> The kexec kernel only uses memory in the crash region. The virtio-mem
> driver properly bails out due to is_kdump_kernel().
> 
> b) "kexec -s -l" seems to work fine. For now, the kernel does not seem
> to get placed on virtio-mem memory (pure luck due to the left-to-right
> search). Memory added by virtio-mem is not getting added to the e820
> map. Once the virtio-mem driver comes back up in the kexec kernel, the
> right memory is readded.
> 
> c) "kexec -c -l" does not work properly. All memory added by virtio-mem
> is added to the e820 map, which is wrong. Memory that should not be
> touched will be touched by the kexec kernel. I assume kexec-tools just
> goes ahead and adds anything it can find in /proc/iomem (or
> /sys/firmware/memmap/) to the e820 map of the new kernel.
> 
> Due to c), I assume all hotplugged memory (e.g., ACPI DIMMs) is
> similarly added to the e820 map and, therefore, won't be able to be
> onlined MOVABLE easily.
> 
> 
> At least for virtio-mem, I would either have to
> a) Not support "kexec -c -l". A viable option if we would be planning on
> not supporting it either way in the long term. I could block this
> in-kernel somehow eventually.
> 
> b) Teach kexec-tools to leave virtio-mem added memory alone. E.g., by
> indicating it in /proc/iomem in a special way ("System RAM
> (hotplugged)"/"System RAM (virtio-mem)").

I just realized, that *not* creating /sys/firmware/memmap/ entries for
virtio-mem memory seems to be the right thing to do.
Eric W. Biederman April 21, 2020, 1:59 p.m. UTC | #47
David Hildenbrand <david@redhat.com> writes:

>>> ACPI SRAT is embeded into efi, need read out the rsdp pointer. If we don't
>>> pass the efi, it won't get the SRAT table correctly, if I remember
>>> correctly. Yeah, I remeber kvm guest can get memory hotplugged with
>>> ACPI only, this won't happen on bare metal though. Need check carefully. 
>>> I have been using kvm guest with uefi firmwire recently.
>> 
>> Yeah, I can imagine that bare metal is different. kvm only uses ACPI.
>> 
>> I'm also asking because of virtio-mem. Memory added via virtio-mem is
>> not part of any efi tables or whatsoever. So I assume the kexec kernel
>> will not detect it automatically (good!), instead load the virtio-mem
>> driver and let it add memory back to the system.
>> 
>> I should probably play with kexec and virtio-mem once I have some spare
>> cycles ... to find out what's broken and needs to be addressed :)
>
> FWIW, I just gave virtio-mem and kexec/kdump a try.
>
> a) kdump seems to work. Memory added by virtio-mem is getting dumped.
> The kexec kernel only uses memory in the crash region. The virtio-mem
> driver properly bails out due to is_kdump_kernel().
>
> b) "kexec -s -l" seems to work fine. For now, the kernel does not seem
> to get placed on virtio-mem memory (pure luck due to the left-to-right
> search). Memory added by virtio-mem is not getting added to the e820
> map. Once the virtio-mem driver comes back up in the kexec kernel, the
> right memory is readded.

This sounds like a bug.

> c) "kexec -c -l" does not work properly. All memory added by virtio-mem
> is added to the e820 map, which is wrong. Memory that should not be
> touched will be touched by the kexec kernel. I assume kexec-tools just
> goes ahead and adds anything it can find in /proc/iomem (or
> /sys/firmware/memmap/) to the e820 map of the new kernel.
>
> Due to c), I assume all hotplugged memory (e.g., ACPI DIMMs) is
> similarly added to the e820 map and, therefore, won't be able to be
> onlined MOVABLE easily.

This sounds like correct behavior to me.  If you add memory to the
system it is treated as memory to the system.

If we need to make it a special kind of memory with special rules we can
have some kind of special marking for the memory.  But hotplugged is not
in itself a sufficient criteria to say don't use this as normal memory.

If I take a huge server and plug in an extra DIMM, it is just memory.

For a similarly huge server I might want to have memory that the system
booted with unpluggable, in case hardware error reporting notices
a dimm generating a lot of memory errors.

Now perhaps virtualization needs a special tier of memory that should
only be used for cases where the memory is easily movable.

I am not familiar with virtio-mem but my skim of the initial design
is that virtio-mem was not designed to be such a special tier of memory.
Perhaps something has changed?
https://lists.gnu.org/archive/html/qemu-devel/2017-06/msg03870.html


> At least for virtio-mem, I would either have to
> a) Not support "kexec -c -l". A viable option if we would be planning on
> not supporting it either way in the long term. I could block this
> in-kernel somehow eventually.

No.

> b) Teach kexec-tools to leave virtio-mem added memory alone. E.g., by
> indicating it in /proc/iomem in a special way ("System RAM
> (hotplugged)"/"System RAM (virtio-mem)").

How does the kernel memory allocator treat this memory?

The logic is simple.  If the kernel memory allocator treats that memory
as ordinary memory available for all uses it should be presented as
ordinary memory available for all uses.

If the kernel memory allocator treats that memory as special memory
only available for uses that we can easily free later and give back to
the system, AKA it is special and not ordinary memory, we should mark
it as such.

Eric

p.s.  Please excuse me for jumping in; I may be missing some important
context, but what I read when I saw this message in my inbox just seemed
very wrong.
David Hildenbrand April 21, 2020, 2:30 p.m. UTC | #48
>> b) "kexec -s -l" seems to work fine. For now, the kernel does not seem
>> to get placed on virtio-mem memory (pure luck due to the left-to-right
>> search). Memory added by virtio-mem is not getting added to the e820
>> map. Once the virtio-mem driver comes back up in the kexec kernel, the
>> right memory is readded.
> 
> This sounds like a bug.

This is how virtio-mem wants its memory to get handled.

> 
>> c) "kexec -c -l" does not work properly. All memory added by virtio-mem
>> is added to the e820 map, which is wrong. Memory that should not be
>> touched will be touched by the kexec kernel. I assume kexec-tools just
>> goes ahead and adds anything it can find in /proc/iomem (or
>> /sys/firmware/memmap/) to the e820 map of the new kernel.
>>
>> Due to c), I assume all hotplugged memory (e.g., ACPI DIMMs) is
>> similarly added to the e820 map and, therefore, won't be able to be
>> onlined MOVABLE easily.
> 
> This sounds like correct behavior to me.  If you add memory to the
> system it is treated as memory to the system.

Yeah, I would agree if we are talking about DIMMs, but this memory is
special. It's added via a paravirtualized interface and will contain
holes, especially after unplug. While memory in these holes can usually
be read, it should not be written. More on that below.

> 
> If we need to make it a special kind of memory with special rules we can
> have some kind of special marking for the memory.  But hotplugged is not
> in itself a sufficient criteria to say don't use this as normal memory.

Agreed. It is special, though.

> 
> If take a huge server and I plug in an extra dimm it is just memory.

Agreed.

[...]

> 
> Now perhaps virtualization needs a special tier of memory that should
> only be used for cases where the memory is easily movable.
> 
> I am not familiar with virtio-mem but my skim of the initial design
> is that virtio-mem was not designed to be such a special tier of memory.
> Perhaps something has changed?
> https://lists.gnu.org/archive/html/qemu-devel/2017-06/msg03870.html

Yes, a lot changed. See
https://lkml.kernel.org/r/20200311171422.10484-1-david@redhat.com for
the latest-greatest design overview.


> 
>> b) Teach kexec-tools to leave virtio-mem added memory alone. E.g., by
>> indicating it in /proc/iomem in a special way ("System RAM
>> (hotplugged)"/"System RAM (virtio-mem)").
> 
> How does the kernel memory allocator treat this memory?

So what virtio-mem does is add memory sections on demand and populate
within these sections the requested amount of memory. E.g., if 64MB are
requested, it will add a 128MB section/resource but only make the first
64MB accessible (via the hypervisor) and only give the first 64MB to the
buddy. This way of adding memory is similar to what the Xen and Hyper-V
balloon drivers do when hotplugging memory.

When requested to plug more memory, it might go ahead and make (parts
of) the remaining 64MB accessible and give them to the buddy. In case it
cannot "fill any holes", it will add a new section.

When requested to unplug memory, it will try to remove part of the
added (here 64MB) memory from the buddy and tell the hypervisor about it.

So, it has some similarity to ballooning in virtual environments;
however, it manages its own device memory only and can therefore give
better guarantees and detect malicious guests.

Right now, I think the right approach would be to not create
/sys/firmware/memmap entries for memory added by virtio-mem.

[...]

> 
> p.s.  Please excuse me for jumping in I may be missing some important
> context, but what I read when I saw this message in my inbox just seemed
> very wrong.

Yeah, still, thanks for having a look. Please let me know if you need
more information.
Baoquan He April 22, 2020, 9:17 a.m. UTC | #49
On 04/21/20 at 03:29pm, David Hildenbrand wrote:
> >> ACPI SRAT is embeded into efi, need read out the rsdp pointer. If we don't
> >> pass the efi, it won't get the SRAT table correctly, if I remember
> >> correctly. Yeah, I remeber kvm guest can get memory hotplugged with
> >> ACPI only, this won't happen on bare metal though. Need check carefully. 
> >> I have been using kvm guest with uefi firmwire recently.
> > 
> > Yeah, I can imagine that bare metal is different. kvm only uses ACPI.
> > 
> > I'm also asking because of virtio-mem. Memory added via virtio-mem is
> > not part of any efi tables or whatsoever. So I assume the kexec kernel
> > will not detect it automatically (good!), instead load the virtio-mem
> > driver and let it add memory back to the system.
> > 
> > I should probably play with kexec and virtio-mem once I have some spare
> > cycles ... to find out what's broken and needs to be addressed :)
> 
> FWIW, I just gave virtio-mem and kexec/kdump a try.
> 
> a) kdump seems to work. Memory added by virtio-mem is getting dumped.
> The kexec kernel only uses memory in the crash region. The virtio-mem
> driver properly bails out due to is_kdump_kernel().

Right, kdump is not impacted by later added memory.

> 
> b) "kexec -s -l" seems to work fine. For now, the kernel does not seem
> to get placed on virtio-mem memory (pure luck due to the left-to-right
> search). Memory added by virtio-mem is not getting added to the e820
> map. Once the virtio-mem driver comes back up in the kexec kernel, the
> right memory is readded.

kexec_file_load just behaves as you tested. It doesn't collect later
added memory to e820 because it uses e820_table_kexec directly to pass
e820 to the kexec-ed kernel. However, this e820_table_kexec is only updated
during the boot stage. I tried hot adding a DIMM after boot; the kexec-ed
kernel doesn't have it in e820 during bootup, but it's recognized and added
during ACPI scanning. I think we should update e820_table_kexec when hot
adding/removing memory, at least for DIMMs. Not sure if DLPAR, virtio-mem,
balloon will need to be added into e820_table_kexec too, and if this is
expected behaviour.

But whatever we do, it won't impact the kexec file loading, because of
the bottom-up searching strategy. Just adding them into e820_table_kexec
will make it consistent with cold reboot, which recognizes them and gets
them into e820 during bootup.
> 
> c) "kexec -c -l" does not work properly. All memory added by virtio-mem
> is added to the e820 map, which is wrong. Memory that should not be
> touched will be touched by the kexec kernel. I assume kexec-tools just
> goes ahead and adds anything it can find in /proc/iomem (or
> /sys/firmware/memmap/) to the e820 map of the new kernel.
> 
> Due to c), I assume all hotplugged memory (e.g., ACPI DIMMs) is
> similarly added to the e820 map and, therefore, won't be able to be
> onlined MOVABLE easily.

Yes, kexec_load will read memory regions from /sys/firmware/memmap/ or
/proc/iomem. Making it right seems a little harder; we can export them
to /proc/iomem or /sys/firmware/memmap/ and mark them with 'hotplug',
but which zone they belong to is not easy to tell.

We are proactively and widely testing kexec_file_load on x86_64, s390,
arm64 by adding test cases into CKI.

> 
> 
> At least for virtio-mem, I would either have to
> a) Not support "kexec -c -l". A viable option if we would be planning on
> not supporting it either way in the long term. I could block this
> in-kernel somehow eventually.
> 
> b) Teach kexec-tools to leave virtio-mem added memory alone. E.g., by
> indicating it in /proc/iomem in a special way ("System RAM
> (hotplugged)"/"System RAM (virtio-mem)").
> 
> Baoquan, any opinion on that?
> 
> -- 
> Thanks,
> 
> David / dhildenb
David Hildenbrand April 22, 2020, 9:24 a.m. UTC | #50
On 22.04.20 11:17, Baoquan He wrote:
> On 04/21/20 at 03:29pm, David Hildenbrand wrote:
>>>> ACPI SRAT is embeded into efi, need read out the rsdp pointer. If we don't
>>>> pass the efi, it won't get the SRAT table correctly, if I remember
>>>> correctly. Yeah, I remeber kvm guest can get memory hotplugged with
>>>> ACPI only, this won't happen on bare metal though. Need check carefully. 
>>>> I have been using kvm guest with uefi firmwire recently.
>>>
>>> Yeah, I can imagine that bare metal is different. kvm only uses ACPI.
>>>
>>> I'm also asking because of virtio-mem. Memory added via virtio-mem is
>>> not part of any efi tables or whatsoever. So I assume the kexec kernel
>>> will not detect it automatically (good!), instead load the virtio-mem
>>> driver and let it add memory back to the system.
>>>
>>> I should probably play with kexec and virtio-mem once I have some spare
>>> cycles ... to find out what's broken and needs to be addressed :)
>>
>> FWIW, I just gave virtio-mem and kexec/kdump a try.
>>
>> a) kdump seems to work. Memory added by virtio-mem is getting dumped.
>> The kexec kernel only uses memory in the crash region. The virtio-mem
>> driver properly bails out due to is_kdump_kernel().
> 
> Right, kdump is not impacted later added memory.
> 
>>
>> b) "kexec -s -l" seems to work fine. For now, the kernel does not seem
>> to get placed on virtio-mem memory (pure luck due to the left-to-right
>> search). Memory added by virtio-mem is not getting added to the e820
>> map. Once the virtio-mem driver comes back up in the kexec kernel, the
>> right memory is readded.
> 
> kexec_file_load just behaves as you tested. It doesn't collect later
> added memory to e820 because it uses e820_table_kexec directly to pass
> e820 to kexec-ed kernel. However, this e820_table_kexec is only updated
> during boot stage. I tried hot adding DIMM after boot, kexec-ed kernel
> doesn't have it in e820 during bootup, but it's recoginized and added
> when ACPI scanning. I think we should update e820_table_kexec when hot
> add/remove memory, at least for DIMM. Not sure if DLPAR, virtio-mem,
> balloon will need be added into e820_table_kexec too, and if this is
> expected behaviour.
> 
> But whatever we do, it won't impact the kexec file_loading, because of
> the searching strategy bottom up. Just adding them into e820_table_kexec
> will make it consistent with cold reboot which get recognizes and get
> them into e820 during bootup.

Yeah, I think whatever a cold-booted kernel will see is what kexec-ed
kernel should see. Not more, not less.

Regarding virtio-mem: Not in e820 on cold-boot.
Regarding DIMMs: DIMMs under KVM will never show up in the e820 map
IIRC. I think on real HW it can be different.
Baoquan He April 22, 2020, 9:57 a.m. UTC | #51
On 04/22/20 at 11:24am, David Hildenbrand wrote:
> On 22.04.20 11:17, Baoquan He wrote:
> > On 04/21/20 at 03:29pm, David Hildenbrand wrote:
> >>>> ACPI SRAT is embeded into efi, need read out the rsdp pointer. If we don't
> >>>> pass the efi, it won't get the SRAT table correctly, if I remember
> >>>> correctly. Yeah, I remeber kvm guest can get memory hotplugged with
> >>>> ACPI only, this won't happen on bare metal though. Need check carefully. 
> >>>> I have been using kvm guest with uefi firmwire recently.
> >>>
> >>> Yeah, I can imagine that bare metal is different. kvm only uses ACPI.
> >>>
> >>> I'm also asking because of virtio-mem. Memory added via virtio-mem is
> >>> not part of any efi tables or whatsoever. So I assume the kexec kernel
> >>> will not detect it automatically (good!), instead load the virtio-mem
> >>> driver and let it add memory back to the system.
> >>>
> >>> I should probably play with kexec and virtio-mem once I have some spare
> >>> cycles ... to find out what's broken and needs to be addressed :)
> >>
> >> FWIW, I just gave virtio-mem and kexec/kdump a try.
> >>
> >> a) kdump seems to work. Memory added by virtio-mem is getting dumped.
> >> The kexec kernel only uses memory in the crash region. The virtio-mem
> >> driver properly bails out due to is_kdump_kernel().
> > 
> > Right, kdump is not impacted later added memory.
> > 
> >>
> >> b) "kexec -s -l" seems to work fine. For now, the kernel does not seem
> >> to get placed on virtio-mem memory (pure luck due to the left-to-right
> >> search). Memory added by virtio-mem is not getting added to the e820
> >> map. Once the virtio-mem driver comes back up in the kexec kernel, the
> >> right memory is readded.
> > 
> > kexec_file_load just behaves as you tested. It doesn't collect later
> > added memory to e820 because it uses e820_table_kexec directly to pass
> > e820 to kexec-ed kernel. However, this e820_table_kexec is only updated
> > during boot stage. I tried hot adding DIMM after boot, kexec-ed kernel
> > doesn't have it in e820 during bootup, but it's recoginized and added
> > when ACPI scanning. I think we should update e820_table_kexec when hot
> > add/remove memory, at least for DIMM. Not sure if DLPAR, virtio-mem,
> > balloon will need be added into e820_table_kexec too, and if this is
> > expected behaviour.
> > 
> > But whatever we do, it won't impact the kexec file_loading, because of
> > the searching strategy bottom up. Just adding them into e820_table_kexec
> > will make it consistent with cold reboot which get recognizes and get
> > them into e820 during bootup.
> 
> Yeah, I think whatever a cold-booted kernel will see is what kexec-ed
> kernel should see. Not more, not less.
> 
> Regarding virtio-mem: Not in e820 on cold-boot.
> Regarding DIMMs: DIMMs under KVM will never show up in the e820 map
> IIRC. I think on real HW it can be different.

Yeah, DIMMs under KVM won't show up in the e820 map. However, this is not a
feature of QEMU/KVM, but a defect of it. I once asked Igor, who is a developer
of QEMU/KVM guests in this area, why we don't make the kvm guest recognize
hotpluggable DIMMs and add them into the e820 map. He said he had tried to do
that, but this would corrupt the guest on HyperV. So he had to revert the
commit on qemu. So I think we can leave it for now for both real HW and
kvm, or update the e820_table_kexec to include added DIMMs for both real
HW and KVM. I hope one day KVM devs will find a way to conquer the defect
on HyperV and make the e820 map consistent with bare metal. After all,
a kvm guest is trying to imitate real HW for the most part.

Anyway, I will think about the e820_table_kexec updating. See if we can
do something about it.
David Hildenbrand April 22, 2020, 10:05 a.m. UTC | #52
On 22.04.20 11:57, Baoquan He wrote:
> On 04/22/20 at 11:24am, David Hildenbrand wrote:
>> On 22.04.20 11:17, Baoquan He wrote:
>>> On 04/21/20 at 03:29pm, David Hildenbrand wrote:
>>>>>> ACPI SRAT is embeded into efi, need read out the rsdp pointer. If we don't
>>>>>> pass the efi, it won't get the SRAT table correctly, if I remember
>>>>>> correctly. Yeah, I remeber kvm guest can get memory hotplugged with
>>>>>> ACPI only, this won't happen on bare metal though. Need check carefully. 
>>>>>> I have been using kvm guest with uefi firmwire recently.
>>>>>
>>>>> Yeah, I can imagine that bare metal is different. kvm only uses ACPI.
>>>>>
>>>>> I'm also asking because of virtio-mem. Memory added via virtio-mem is
>>>>> not part of any efi tables or whatsoever. So I assume the kexec kernel
>>>>> will not detect it automatically (good!), instead load the virtio-mem
>>>>> driver and let it add memory back to the system.
>>>>>
>>>>> I should probably play with kexec and virtio-mem once I have some spare
>>>>> cycles ... to find out what's broken and needs to be addressed :)
>>>>
>>>> FWIW, I just gave virtio-mem and kexec/kdump a try.
>>>>
>>>> a) kdump seems to work. Memory added by virtio-mem is getting dumped.
>>>> The kexec kernel only uses memory in the crash region. The virtio-mem
>>>> driver properly bails out due to is_kdump_kernel().
>>>
>>> Right, kdump is not impacted later added memory.
>>>
>>>>
>>>> b) "kexec -s -l" seems to work fine. For now, the kernel does not seem
>>>> to get placed on virtio-mem memory (pure luck due to the left-to-right
>>>> search). Memory added by virtio-mem is not getting added to the e820
>>>> map. Once the virtio-mem driver comes back up in the kexec kernel, the
>>>> right memory is readded.
>>>
>>> kexec_file_load just behaves as you tested. It doesn't collect later
>>> added memory to e820 because it uses e820_table_kexec directly to pass
>>> e820 to kexec-ed kernel. However, this e820_table_kexec is only updated
>>> during boot stage. I tried hot adding DIMM after boot, kexec-ed kernel
>>> doesn't have it in e820 during bootup, but it's recoginized and added
>>> when ACPI scanning. I think we should update e820_table_kexec when hot
>>> add/remove memory, at least for DIMM. Not sure if DLPAR, virtio-mem,
>>> balloon will need be added into e820_table_kexec too, and if this is
>>> expected behaviour.
>>>
>>> But whatever we do, it won't impact the kexec file_loading, because of
>>> the searching strategy bottom up. Just adding them into e820_table_kexec
>>> will make it consistent with cold reboot which get recognizes and get
>>> them into e820 during bootup.
>>
>> Yeah, I think whatever a cold-booted kernel will see is what kexec-ed
>> kernel should see. Not more, not less.
>>
>> Regarding virtio-mem: Not in e820 on cold-boot.
>> Regarding DIMMs: DIMMs under KVM will never show up in the e820 map
>> IIRC. I think on real HW it can be different.
> 
> Yeah, DIMMs under KVM won't show up in e820 map. While this is not feature
> of QEMU/KVM, but a defect of it. I ever asked Igor who is developer of
> QEMU/KVM guest in this area, why we don't make kvm guest recognize
> hotpluggable DIMM and add it into e820 map, he said he had tried to make
> it, but this will corrupt guest on HyperV. So he had to revert the

Yeah, I remember that this had to be reverted due to something breaking.
But OTOH, it allows us to online coldplugged DIMMs online_movable
easily, so I'd say it's even a feature (although it does not behave like
the real HW we have).

I use this extensively when testing memory hot(un)plug via coldplugged
DIMMs.

I do wonder if there is real HW, where this is also the case.

> commit on qemu. So I think we can leave it for now for both real HW and
> kvm, or update the e820_table_kexec to include added DIMM for both real
> HW and KVM. I hope one day KVM dev will find a way to conquer the defect
> on HyperV and make the e820map consistent with bare metal. After all,
> kvm guest is trying to imitate real HW for the most part.
> 
> Anyway, I will think about the e820_table_kexec updating. See if we can
> do something about it.

Yeah, for DIMMs on real HW it might definitely make sense. We might be
able to hook into updates of /sys/firmware/memmap on memory add/remove.
Baoquan He April 22, 2020, 10:36 a.m. UTC | #53
On 04/22/20 at 12:05pm, David Hildenbrand wrote:
> On 22.04.20 11:57, Baoquan He wrote:
> > On 04/22/20 at 11:24am, David Hildenbrand wrote:
> >> On 22.04.20 11:17, Baoquan He wrote:
> >>> On 04/21/20 at 03:29pm, David Hildenbrand wrote:
> >>>>>> ACPI SRAT is embeded into efi, need read out the rsdp pointer. If we don't
> >>>>>> pass the efi, it won't get the SRAT table correctly, if I remember
> >>>>>> correctly. Yeah, I remeber kvm guest can get memory hotplugged with
> >>>>>> ACPI only, this won't happen on bare metal though. Need check carefully. 
> >>>>>> I have been using kvm guest with uefi firmwire recently.
> >>>>>
> >>>>> Yeah, I can imagine that bare metal is different. kvm only uses ACPI.
> >>>>>
> >>>>> I'm also asking because of virtio-mem. Memory added via virtio-mem is
> >>>>> not part of any efi tables or whatsoever. So I assume the kexec kernel
> >>>>> will not detect it automatically (good!), instead load the virtio-mem
> >>>>> driver and let it add memory back to the system.
> >>>>>
> >>>>> I should probably play with kexec and virtio-mem once I have some spare
> >>>>> cycles ... to find out what's broken and needs to be addressed :)
> >>>>
> >>>> FWIW, I just gave virtio-mem and kexec/kdump a try.
> >>>>
> >>>> a) kdump seems to work. Memory added by virtio-mem is getting dumped.
> >>>> The kexec kernel only uses memory in the crash region. The virtio-mem
> >>>> driver properly bails out due to is_kdump_kernel().
> >>>
> >>> Right, kdump is not impacted later added memory.
> >>>
> >>>>
> >>>> b) "kexec -s -l" seems to work fine. For now, the kernel does not seem
> >>>> to get placed on virtio-mem memory (pure luck due to the left-to-right
> >>>> search). Memory added by virtio-mem is not getting added to the e820
> >>>> map. Once the virtio-mem driver comes back up in the kexec kernel, the
> >>>> right memory is readded.
> >>>
> >>> kexec_file_load just behaves as you tested. It doesn't collect later
> >>> added memory to e820 because it uses e820_table_kexec directly to pass
> >>> e820 to kexec-ed kernel. However, this e820_table_kexec is only updated
> >>> during boot stage. I tried hot adding DIMM after boot, kexec-ed kernel
> >>> doesn't have it in e820 during bootup, but it's recoginized and added
> >>> when ACPI scanning. I think we should update e820_table_kexec when hot
> >>> add/remove memory, at least for DIMM. Not sure if DLPAR, virtio-mem,
> >>> balloon will need be added into e820_table_kexec too, and if this is
> >>> expected behaviour.
> >>>
> >>> But whatever we do, it won't impact the kexec file_loading, because of
> >>> the searching strategy bottom up. Just adding them into e820_table_kexec
> >>> will make it consistent with cold reboot which get recognizes and get
> >>> them into e820 during bootup.
> >>
> >> Yeah, I think whatever a cold-booted kernel will see is what kexec-ed
> >> kernel should see. Not more, not less.
> >>
> >> Regarding virtio-mem: Not in e820 on cold-boot.
> >> Regarding DIMMs: DIMMs under KVM will never show up in the e820 map
> >> IIRC. I think on real HW it can be different.
> > 
> > Yeah, DIMMs under KVM won't show up in e820 map. While this is not feature
> > of QEMU/KVM, but a defect of it. I ever asked Igor who is developer of
> > QEMU/KVM guest in this area, why we don't make kvm guest recognize
> > hotpluggable DIMM and add it into e820 map, he said he had tried to make
> > it, but this will corrupt guest on HyperV. So he had to revert the
> 
> Yeah, I remember that this had to be reverted due to something breaking.
> But OTOH, it allows us to online coldplugged DIMMs online_movable
> easily, so I'd say it's even a feature (although, does not behave like
> real HW we have).
> 
> I use this extensively when testing memory hot(un)plug via coldplugged
> DIMMs.
> 
> I do wonder if there is real HW, where this is also the case.

None that I know of. Hotplug on real HW includes two parts; the boot
mem being hotpluggable is the more flexible one. It allows people to
replace a bad DIMM. And you can see that code in the boot stage has been
adjusted a lot for this purpose; at that time, people hadn't thought about
kvm guests.

> 
> > commit on qemu. So I think we can leave it for now for both real HW and
> > kvm, or update the e820_table_kexec to include added DIMM for both real
> > HW and KVM. I hope one day KVM dev will find a way to conquer the defect
> > on HyperV and make the e820map consistent with bare metal. After all,
> > kvm guest is trying to imitate real HW for the most part.
> > 
> > Anyway, I will think about the e820_table_kexec updating. See if we can
> > do something about it.
> 
> Yeah, for DIMMs on real HW it might definitely make sense. We might be
> able to hook into updates of /sys/firmware/memmap on memory add/remove.
James Morse April 22, 2020, 12:28 p.m. UTC | #54
Hi Eric,

On 15/04/2020 21:33, Eric W. Biederman wrote:
> James Morse <james.morse@arm.com> writes:
> 
>> An image loaded for kexec is not stored in place, instead its segments
>> are scattered through memory, and are re-assembled when needed. In the
>> meantime, the target memory may have been removed.
>>
>> Because mm is not aware that this memory is still in use, it allows it
>> to be removed.
>>
>> Add a memory notifier to prevent the removal of memory regions that
>> overlap with a loaded kexec image segment. e.g., when triggered from the
>> Qemu console:
>> | kexec_core: memory region in use
>> | memory memory32: Offline failed.
>>
>> Signed-off-by: James Morse <james.morse@arm.com>
> 
> Given that we are talking about the destination pages for kexec
> not where the loaded kernel is currently stored the description is
> confusing.

I think David has some better wording to cover this. I thought I had it with 'scattered
and re-assembled'.


> Beyond that I think it would be better to simply unload the loaded
> kernel at memory hotunplug time.

Unconditionally, or if it aliases the removed region?

I don't particularly like it. User-space has asked for two impossible things, we are
changing the answer to the first when we see the second. It's a bit spooky. (maybe no one
will notice)


> Usually somewhere in the loaded image
> is a copy of the memory map at the time the kexec kernel was loaded.
> That will invalidate the memory map as well.

Ah, unconditionally. Sure, x86 needs this.
(arm64 re-discovers the memory map from firmware tables after kexec)

If that's an acceptable change in behaviour, sure, let's do that.


> All of this should be for a very brief window of a few seconds, as
> the loaded kexec image is quite short.

It seems I'm the outlier anticipating anything could happen between those syscalls.


Thanks,

James
Eric W. Biederman April 22, 2020, 3:25 p.m. UTC | #55
James Morse <james.morse@arm.com> writes:

> Hi Eric,
>
> On 15/04/2020 21:33, Eric W. Biederman wrote:
>> James Morse <james.morse@arm.com> writes:
>> 
>>> An image loaded for kexec is not stored in place, instead its segments
>>> are scattered through memory, and are re-assembled when needed. In the
>>> meantime, the target memory may have been removed.
>>>
>>> Because mm is not aware that this memory is still in use, it allows it
>>> to be removed.
>>>
>>> Add a memory notifier to prevent the removal of memory regions that
>>> overlap with a loaded kexec image segment. e.g., when triggered from the
>>> Qemu console:
>>> | kexec_core: memory region in use
>>> | memory memory32: Offline failed.
>>>
>>> Signed-off-by: James Morse <james.morse@arm.com>
>> 
>> Given that we are talking about the destination pages for kexec
>> not where the loaded kernel is currently stored the description is
>> confusing.
>
> I think David has some better wording to cover this. I thought I had it with 'scattered
> and re-assembled'.

The confusing part was talking about memory being still in use,
that is actually scheduled for use in the future.

>> Usually somewhere in the loaded image
>> is a copy of the memory map at the time the kexec kernel was loaded.
>> That will invalidate the memory map as well.
>
> Ah, unconditionally. Sure, x86 needs this.
> (arm64 re-discovers the memory map from firmware tables after kexec)
>
> If that's an acceptable change in behaviour, sure, lets do that.

Yes.


>> All of this should be for a very brief window of a few seconds, as
>> the loaded kexec image is quite short.
>
> It seems I'm the outlier anticipating anything could happen between
> those syscalls.

The design is:
	sys_kexec_load()
	shutdown scripts
        sys_reboot(LINUX_REBOOT_CMD_KEXEC);

There are two system calls simply so that the shutdown scripts can run.
Now maybe someone somewhere does something different but that is not
expected.

Only the kexec on panic kernel is expected to persist somewhat
indefinitely.  But that should be in memory that is reserved from boot
time, and so the memory hotplug should have enough visibility to not
allow that memory to be given up.

Eric
David Hildenbrand April 22, 2020, 4:40 p.m. UTC | #56
> The confusing part was talking about memory being still in use,
> that is actually scheduled for use in the future.

+1

> 
>>> Usually somewhere in the loaded image
>>> is a copy of the memory map at the time the kexec kernel was loaded.
>>> That will invalidate the memory map as well.
>>
>> Ah, unconditionally. Sure, x86 needs this.
>> (arm64 re-discovers the memory map from firmware tables after kexec)

Does this include hotplugged DIMMs e.g., under KVM?
[...]

>>> All of this should be for a very brief window of a few seconds, as
>>> the loaded kexec image is quite short.
>>
>> It seems I'm the outlier anticipating anything could happen between
>> those syscalls.
> 
> The design is:
> 	sys_kexec_load()
> 	shutdown scripts
>         sys_reboot(LINUX_REBOOT_CMD_KEXEC);
> 
> There are two system call simply so that the shutdown scripts can run.
> Now maybe someone somewhere does something different but that is not
> expected.
> 
> Only the kexec on panic kernel is expected to persist somewhat
> indefinitely.  But that should be in memory that is reserved from boot
> time, and so the memory hotplug should have enough visibility to not
> allow that memory to be given up.

Yes, and AFAIK, memory blocks which hold the reserved crashkernel area
can usually not get offlined and, therefore, the memory cannot get removed.

Interestingly, s390x even has a hotplug notifier for that

arch/s390/kernel/setup.c:kdump_mem_notifier()

(offlining of memory on s390x can result in memory getting depopulated
in the hypervisor, so after it would have been offlined, it would no
longer be accessible. I somewhat doubt that this notifier is really
needed - all pages in the crashkernel area should look like ordinary
allocated pages when the area is reserved early during boot via the
memblock allocator, and therefore offlining cannot succeed. But that's a
different story - and I suspect this is a leftover from pre-memblock times.)
Eric W. Biederman April 23, 2020, 4:29 p.m. UTC | #57
David Hildenbrand <david@redhat.com> writes:

>> The confusing part was talking about memory being still in use,
>> that is actually scheduled for use in the future.
>
> +1
>
>> 
>>>> Usually somewhere in the loaded image
>>>> is a copy of the memory map at the time the kexec kernel was loaded.
>>>> That will invalidate the memory map as well.
>>>
>>> Ah, unconditionally. Sure, x86 needs this.
>>> (arm64 re-discovers the memory map from firmware tables after kexec)
>
> Does this include hotplugged DIMMs e.g., under KVM?
> [...]

As far as I know.  If the memory map changes we need to drop the loaded
image.


Having thought about it a little more I suspect it would be better to go
the other way and just block all hotplug actions after a kexec_load,
as all we expect to happen is running shutdown scripts.

If blocking the hotplug action uses printk to print a nice message
saying something like: "Hotplug blocked because of a loaded kexec image",
then people will be able to figure out what is going on and
call kexec -u if they haven't started the shutdown scripts yet.


Either way it is something simple and unconditional that will make
things work.

>>>> All of this should be for a very brief window of a few seconds, as
>>>> the loaded kexec image is quite short.
>>>
>>> It seems I'm the outlier anticipating anything could happen between
>>> those syscalls.
>> 
>> The design is:
>> 	sys_kexec_load()
>> 	shutdown scripts
>>         sys_reboot(LINUX_REBOOT_CMD_KEXEC);
>> 
>> There are two system call simply so that the shutdown scripts can run.
>> Now maybe someone somewhere does something different but that is not
>> expected.
>> 
>> Only the kexec on panic kernel is expected to persist somewhat
>> indefinitely.  But that should be in memory that is reserved from boot
>> time, and so the memory hotplug should have enough visibility to not
>> allow that memory to be given up.
>
> Yes, and AFAIK, memory blocks which hold the reserved crashkernel area
> can usually not get offlined and, therefore, the memory cannot get removed.
>
> Interestingly, s390x even has a hotplug notifier for that
>
> arch/s390/kernel/setup.c:kdump_mem_notifier()
>
> (offlining of memory on s390x can result in memory getting depopulated
> in the hypervisor, so after it would have been offlined, it would no
> longer be accessible. I somewhat doubt that this notifier is really
> needed - all pages in the crashkernel area should look like ordinary
> allocated pages when the area is reserved early during boot via the
> memblock allocator, and therefore offlining cannot succeed. But that's a
> different story - and I suspect this is a leftover from pre-memblock times.)

It might be worth seeing if that is true, or if we need to generalize the
s390x code.

Eric
David Hildenbrand April 24, 2020, 7:39 a.m. UTC | #58
On 23.04.20 18:29, Eric W. Biederman wrote:
> David Hildenbrand <david@redhat.com> writes:
> 
>>> The confusing part was talking about memory being still in use,
>>> that is actually scheduled for use in the future.
>>
>> +1
>>
>>>
>>>>> Usually somewhere in the loaded image
>>>>> is a copy of the memory map at the time the kexec kernel was loaded.
>>>>> That will invalidate the memory map as well.
>>>>
>>>> Ah, unconditionally. Sure, x86 needs this.
>>>> (arm64 re-discovers the memory map from firmware tables after kexec)
>>
>> Does this include hotplugged DIMMs e.g., under KVM?
>> [...]
> 
> As far as I know.  If the memory map changes we need to drop the loaded
> image.
> 
> 
> Having thought about it a little more I suspect it would be the
> other way and just block all hotplug actions after a kexec_load.
> As all we expect to happen is running shutdown scripts.
> 
> If blocking the hotplug action uses printk to print a nice message
> saying something like: "Hotplug blocked because of a loaded kexec image",
> then people will be able to figure out what is going on and
> call kexec -u if they haven't started the shutdown scripts yet.
> 
> 
> Either way it is something simple and unconditional that will make
> things work.
> 

Personally, I consider memory hotplug more important than keeping loaded
kexec data alive (just because somebody once decided to do a "kexec -l"
and never did a "kexec -e" we should not block any memory hot(un)plug -
especially in virtualized environments - for all eternity).

So IMHO we would invalidate loaded kexec data (not the crashkernel, of
course) on memory hot(un)plug and print a warning. In addition, we can
let kexec-tools try to reload whatever they loaded after getting
notified that something changed.

The "something changed" is visible to user space e.g., via udev events
for /sys/devices/memory/memoryX/

>>>>> All of this should be for a very brief window of a few seconds, as
>>>>> the loaded kexec image is quite short.
>>>>
>>>> It seems I'm the outlier anticipating anything could happen between
>>>> those syscalls.
>>>
>>> The design is:
>>> 	sys_kexec_load()
>>> 	shutdown scripts
>>>         sys_reboot(LINUX_REBOOT_CMD_KEXEC);
>>>
>>> There are two system call simply so that the shutdown scripts can run.
>>> Now maybe someone somewhere does something different but that is not
>>> expected.
>>>
>>> Only the kexec on panic kernel is expected to persist somewhat
>>> indefinitely.  But that should be in memory that is reserved from boot
>>> time, and so the memory hotplug should have enough visibility to not
>>> allow that memory to be given up.
>>
>> Yes, and AFAIK, memory blocks which hold the reserved crashkernel area
>> can usually not get offlined and, therefore, the memory cannot get removed.
>>
>> Interestingly, s390x even has a hotplug notifier for that
>>
>> arch/s390/kernel/setup.c:kdump_mem_notifier()
>>
>> (offlining of memory on s390x can result in memory getting depopulated
>> in the hypervisor, so after it would have been offlined, it would no
>> longer be accessible. I somewhat doubt that this notifier is really
>> needed - all pages in the crashkernel area should look like ordinary
>> allocated pages when the area is reserved early during boot via the
>> memblock allocator, and therefore offlining cannot succeed. But that's a
>> different story - and I suspect this is a leftover from pre-memblock times.)
> 
> It might be worth seeing if that is true, or if we need to generalize the
> s390x code.

I'll try to find some time to test if the s390x handler is still relevant.
David Hildenbrand April 24, 2020, 7:41 a.m. UTC | #59
On 24.04.20 09:39, David Hildenbrand wrote:
> On 23.04.20 18:29, Eric W. Biederman wrote:
>> David Hildenbrand <david@redhat.com> writes:
>>
>>>> The confusing part was talking about memory being still in use,
>>>> that is actually scheduled for use in the future.
>>>
>>> +1
>>>
>>>>
>>>>>> Usually somewhere in the loaded image
>>>>>> is a copy of the memory map at the time the kexec kernel was loaded.
>>>>>> That will invalidate the memory map as well.
>>>>>
>>>>> Ah, unconditionally. Sure, x86 needs this.
>>>>> (arm64 re-discovers the memory map from firmware tables after kexec)
>>>
>>> Does this include hotplugged DIMMs e.g., under KVM?
>>> [...]
>>
>> As far as I know.  If the memory map changes we need to drop the loaded
>> image.
>>
>>
>> Having thought about it a little more I suspect it would be the
>> other way and just block all hotplug actions after a kexec_load.
>> As all we expect to happen is running shutdown scripts.
>>
>> If blocking the hotplug action uses printk to print a nice message
>> saying something like: "Hotplug blocked because of a loaded kexec image",
>> then people will be able to figure out what is going on and
>> call kexec -u if they haven't started the shutdown scripts yet.
>>
>>
>> Either way it is something simple and unconditional that will make
>> things work.
>>
> 
> Personally, I consider memory hotplug more important than keeping loaded
> kexec data alive (just because somebody once decided to do a "kexec -l"
> and never did a "kexec -e" we should not block any memory hot(un)plug -
> especially in virtualized environments - for all eternity).
> 
> So IMHO we would invalidate loaded kexec data (not the crashkernel, of
> course) on memory hot(un)plug and print a warning. In addition, we can
> let kexec-tools try to reload whatever they loaded after getting
> notified that something changed.
> 
> The "something changed" is visible to user space e.g., via udev events
> for /sys/devices/memory/memoryX/

/sys/devices/system/memory/memoryX/ ...
James Morse May 1, 2020, 4:55 p.m. UTC | #60
Hi guys,

On 22/04/2020 17:40, David Hildenbrand wrote:
>>>> Usually somewhere in the loaded image
>>>> is a copy of the memory map at the time the kexec kernel was loaded.
>>>> That will invalidate the memory map as well.
>>>
>>> Ah, unconditionally. Sure, x86 needs this.
>>> (arm64 re-discovers the memory map from firmware tables after kexec)

> Does this include hotplugged DIMMs e.g., under KVM?

If you advertise hotplugged memory to the guest using ACPI, yes.

We don't have a practical mechanism to pass 'facts' about the platform between kernels,
instead we rely on those facts being discoverable, or described by firmware.


>>>> All of this should be for a very brief window of a few seconds, as
>>>> the loaded kexec image is quite short.
>>>
>>> It seems I'm the outlier anticipating anything could happen between
>>> those syscalls.
>>
>> The design is:
>> 	sys_kexec_load()
>> 	shutdown scripts
>>         sys_reboot(LINUX_REBOOT_CMD_KEXEC);
>>
>> There are two system call simply so that the shutdown scripts can run.
>> Now maybe someone somewhere does something different but that is not
>> expected.

[...]

> Yes, and AFAIK, memory blocks which hold the reserved crashkernel area
> can usually not get offlined and, therefore, the memory cannot get removed.

The crashkernel area on arm64 will always land in un-removable memory. We set PG_Reserved
on it too.


Thanks,

James
diff mbox series

Patch

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index c19c0dad1ebe..ba1d91e868ca 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -12,6 +12,7 @@ 
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/kexec.h>
+#include <linux/memory.h>
 #include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/highmem.h>
@@ -22,10 +23,12 @@ 
 #include <linux/elf.h>
 #include <linux/elfcore.h>
 #include <linux/utsname.h>
+#include <linux/notifier.h>
 #include <linux/numa.h>
 #include <linux/suspend.h>
 #include <linux/device.h>
 #include <linux/freezer.h>
+#include <linux/pfn.h>
 #include <linux/pm.h>
 #include <linux/cpu.h>
 #include <linux/uaccess.h>
@@ -1219,3 +1222,56 @@  void __weak arch_kexec_protect_crashkres(void)
 
 void __weak arch_kexec_unprotect_crashkres(void)
 {}
+
+/*
+ * Memory hotplug notifier: veto MEM_GOING_OFFLINE when the range overlaps a
+ * segment of the loaded kexec image; user-space must unload the image first.
+ */
+static int mem_remove_cb(struct notifier_block *nb, unsigned long action,
+			 void *data)
+{
+	int rv = NOTIFY_OK;
+	struct memory_notify *arg = data;
+	unsigned long pfn = arg->start_pfn;
+	unsigned long i, nr_segments, sstart, send;
+	unsigned long end_pfn = arg->start_pfn + arg->nr_pages;
+
+	might_sleep();
+
+	if (action != MEM_GOING_OFFLINE)
+		return NOTIFY_DONE;
+
+	mutex_lock(&kexec_mutex);
+	if (kexec_image) {
+		nr_segments = kexec_image->nr_segments;
+
+		for (i = 0; i < nr_segments; i++) {
+			sstart = PFN_DOWN(kexec_image->segment[i].mem);
+			send = PFN_UP(kexec_image->segment[i].mem +
+				      kexec_image->segment[i].memsz);
+
+			/* [sstart, send) and [pfn, end_pfn) are half-open */
+			if (sstart < end_pfn && pfn < send) {
+				pr_warn("Memory region in use\n");
+				rv = NOTIFY_BAD;
+				break;
+			}
+		}
+	}
+	mutex_unlock(&kexec_mutex);
+
+	return rv;
+}
+
+static struct notifier_block mem_remove_nb = {
+	.notifier_call = mem_remove_cb,	/* kexec MEM_GOING_OFFLINE veto */
+};
+
+/* Register the kexec memory notifier; nothing to do without MEMORY_HOTPLUG. */
+static int __init register_mem_remove_cb(void)
+{
+	if (!IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
+		return 0;
+	return register_memory_notifier(&mem_remove_nb);
+}
+device_initcall(register_mem_remove_cb);