diff mbox series

kexec: Discard loaded image on memory hotplug

Message ID 20200501165701.24587-1-james.morse@arm.com (mailing list archive)
State New, archived
Headers show
Series kexec: Discard loaded image on memory hotplug | expand

Commit Message

James Morse May 1, 2020, 4:57 p.m. UTC
On x86, the kexec payload contains a copy of the current memory map.
If memory is added or removed, this copy of the memory map becomes
stale. Getting this wrong may prevent the next kernel from booting.
The first kernel may die if it tries to re-assemble the next kernel
in memory that has been removed.

Discard the loaded kexec image when the memory map changes, user-space
should reload it.

Kdump is unaffected, as it is placed within the crashkernel reserved
memory area and only uses this memory. The stale memory map may affect
generation of the vmcore, but the kdump kernel should be in a position
to validate it.

Signed-off-by: James Morse <james.morse@arm.com>
---
This patch obsoletes:
 * kexec/memory_hotplug: Prevent removal and accidental use
https://lore.kernel.org/linux-arm-kernel/20200326180730.4754-1-james.morse@arm.com/

 kernel/kexec_core.c | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

Comments

David Hildenbrand May 1, 2020, 5:26 p.m. UTC | #1
On 01.05.20 18:57, James Morse wrote:
> On x86, the kexec payload contains a copy of the current memory map.
> If memory is added or removed, this copy of the memory map becomes
> stale. Getting this wrong may prevent the next kernel from booting.
> The first kernel may die if it tries to re-assemble the next kernel
> in memory that has been removed.
> 
> Discard the loaded kexec image when the memory map changes, user-space
> should reload it.
> 
> Kdump is unaffected, as it is placed within the crashkernel reserved
> memory area and only uses this memory. The stale memory map may affect
> generation of the vmcore, but the kdump kernel should be in a position
> to validate it.
> 
> Signed-off-by: James Morse <james.morse@arm.com>
> ---
> This patch obsoletes:
>  * kexec/memory_hotplug: Prevent removal and accidental use
> https://lore.kernel.org/linux-arm-kernel/20200326180730.4754-1-james.morse@arm.com/
> 
>  kernel/kexec_core.c | 40 ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 40 insertions(+)
> 
> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
> index c19c0dad1ebe..e1901e5bd4b5 100644
> --- a/kernel/kexec_core.c
> +++ b/kernel/kexec_core.c
> @@ -12,6 +12,7 @@
>  #include <linux/slab.h>
>  #include <linux/fs.h>
>  #include <linux/kexec.h>
> +#include <linux/memory.h>
>  #include <linux/mutex.h>
>  #include <linux/list.h>
>  #include <linux/highmem.h>
> @@ -22,10 +23,12 @@
>  #include <linux/elf.h>
>  #include <linux/elfcore.h>
>  #include <linux/utsname.h>
> +#include <linux/notifier.h>
>  #include <linux/numa.h>
>  #include <linux/suspend.h>
>  #include <linux/device.h>
>  #include <linux/freezer.h>
> +#include <linux/pfn.h>
>  #include <linux/pm.h>
>  #include <linux/cpu.h>
>  #include <linux/uaccess.h>
> @@ -1219,3 +1222,40 @@ void __weak arch_kexec_protect_crashkres(void)
>  
>  void __weak arch_kexec_unprotect_crashkres(void)
>  {}
> +
> +/*
> + * If the memory layout changes, any loaded kexec image should be evicted
> + * as it may contain a copy of the (now stale) memory map. This also means
> + * we don't need to check the memory is still present when re-assembling the
> + * new kernel at machine_kexec() time.
> + */

Onlining/offlining is not a change of the memory map.
Eric W. Biederman May 9, 2020, 3:14 p.m. UTC | #2
David Hildenbrand <david@redhat.com> writes:

> On 01.05.20 18:57, James Morse wrote:
>> On x86, the kexec payload contains a copy of the current memory map.
>> If memory is added or removed, this copy of the memory map becomes
>> stale. Getting this wrong may prevent the next kernel from booting.
>> The first kernel may die if it tries to re-assemble the next kernel
>> in memory that has been removed.
>> 
>> Discard the loaded kexec image when the memory map changes, user-space
>> should reload it.
>> 
>> Kdump is unaffected, as it is placed within the crashkernel reserved
>> memory area and only uses this memory. The stale memory map may affect
>> generation of the vmcore, but the kdump kernel should be in a position
>> to validate it.
>> 
>> Signed-off-by: James Morse <james.morse@arm.com>
>> ---
>> This patch obsoletes:
>>  * kexec/memory_hotplug: Prevent removal and accidental use
>> https://lore.kernel.org/linux-arm-kernel/20200326180730.4754-1-james.morse@arm.com/
>> 
>>  kernel/kexec_core.c | 40 ++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 40 insertions(+)
>> 
>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>> index c19c0dad1ebe..e1901e5bd4b5 100644
>> --- a/kernel/kexec_core.c
>> +++ b/kernel/kexec_core.c
>> @@ -12,6 +12,7 @@
>>  #include <linux/slab.h>
>>  #include <linux/fs.h>
>>  #include <linux/kexec.h>
>> +#include <linux/memory.h>
>>  #include <linux/mutex.h>
>>  #include <linux/list.h>
>>  #include <linux/highmem.h>
>> @@ -22,10 +23,12 @@
>>  #include <linux/elf.h>
>>  #include <linux/elfcore.h>
>>  #include <linux/utsname.h>
>> +#include <linux/notifier.h>
>>  #include <linux/numa.h>
>>  #include <linux/suspend.h>
>>  #include <linux/device.h>
>>  #include <linux/freezer.h>
>> +#include <linux/pfn.h>
>>  #include <linux/pm.h>
>>  #include <linux/cpu.h>
>>  #include <linux/uaccess.h>
>> @@ -1219,3 +1222,40 @@ void __weak arch_kexec_protect_crashkres(void)
>>  
>>  void __weak arch_kexec_unprotect_crashkres(void)
>>  {}
>> +
>> +/*
>> + * If the memory layout changes, any loaded kexec image should be evicted
>> + * as it may contain a copy of the (now stale) memory map. This also means
>> + * we don't need to check the memory is still present when re-assembling the
>> + * new kernel at machine_kexec() time.
>> + */
>
> Onlining/offlining is not a change of the memory map.

Phrasing it that way is non-sense.  What is important is memory
available in the system.  A memory map is just a reflection upon that,
a memory map is not the definition of truth.

So if this notifier reflects when memory is coming and going on the
system this is a reasonable approach.  

Do these notifiers might fire for special kinds of memory that should
only be used for very special purposes?

This change with the addition of some filters say to limit taking action
to MEM_ONLINE and MEM_OFFLINE looks reasonable to me.  Probably also
filtering out special kinds of memory that is not gernally useful.

Eric
Baoquan He May 10, 2020, 1:06 p.m. UTC | #3
Hi James,

On 05/01/20 at 05:57pm, James Morse wrote:
> On x86, the kexec payload contains a copy of the current memory map.
> If memory is added or removed, this copy of the memory map becomes
> stale. Getting this wrong may prevent the next kernel from booting.
> The first kernel may die if it tries to re-assemble the next kernel
> in memory that has been removed.
> 
> Discard the loaded kexec image when the memory map changes, user-space
> should reload it.

As we have discarded in your patches thread, adding a kexec service to
reload kexec should fix this. Do you mean there's still another issue
that we need fix? I may not get it clearly.

> 
> Kdump is unaffected, as it is placed within the crashkernel reserved
> memory area and only uses this memory. The stale memory map may affect
> generation of the vmcore, but the kdump kernel should be in a position
> to validate it.
> 
> Signed-off-by: James Morse <james.morse@arm.com>
> ---
> This patch obsoletes:
>  * kexec/memory_hotplug: Prevent removal and accidental use
> https://lore.kernel.org/linux-arm-kernel/20200326180730.4754-1-james.morse@arm.com/
> 
>  kernel/kexec_core.c | 40 ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 40 insertions(+)
> 
> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
> index c19c0dad1ebe..e1901e5bd4b5 100644
> --- a/kernel/kexec_core.c
> +++ b/kernel/kexec_core.c
> @@ -12,6 +12,7 @@
>  #include <linux/slab.h>
>  #include <linux/fs.h>
>  #include <linux/kexec.h>
> +#include <linux/memory.h>
>  #include <linux/mutex.h>
>  #include <linux/list.h>
>  #include <linux/highmem.h>
> @@ -22,10 +23,12 @@
>  #include <linux/elf.h>
>  #include <linux/elfcore.h>
>  #include <linux/utsname.h>
> +#include <linux/notifier.h>
>  #include <linux/numa.h>
>  #include <linux/suspend.h>
>  #include <linux/device.h>
>  #include <linux/freezer.h>
> +#include <linux/pfn.h>
>  #include <linux/pm.h>
>  #include <linux/cpu.h>
>  #include <linux/uaccess.h>
> @@ -1219,3 +1222,40 @@ void __weak arch_kexec_protect_crashkres(void)
>  
>  void __weak arch_kexec_unprotect_crashkres(void)
>  {}
> +
> +/*
> + * If the memory layout changes, any loaded kexec image should be evicted
> + * as it may contain a copy of the (now stale) memory map. This also means
> + * we don't need to check the memory is still present when re-assembling the
> + * new kernel at machine_kexec() time.
> + */
> +static int mem_change_cb(struct notifier_block *nb, unsigned long action,
> +			 void *data)
> +{
> +	/*
> +	 * Actions are either a change, or a change being cancelled.
> +	 * A second discard for 'cancel's is harmless.
> +	 */
> +
> +	mutex_lock(&kexec_mutex);
> +	if (kexec_image) {
> +		kimage_free(xchg(&kexec_image, NULL));
> +		pr_warn("loaded image discarded due to memory hotplug");
> +	}
> +	mutex_unlock(&kexec_mutex);
> +
> +	return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block mem_change_nb = {
> +	.notifier_call = mem_change_cb,
> +};
> +
> +static int __init register_mem_change_cb(void)
> +{
> +	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
> +		return register_memory_notifier(&mem_change_nb);
> +
> +	return 0;
> +}
> +device_initcall(register_mem_change_cb);
> -- 
> 2.26.1
> 
>
David Hildenbrand May 11, 2020, 8:19 a.m. UTC | #4
On 09.05.20 17:14, Eric W. Biederman wrote:
> David Hildenbrand <david@redhat.com> writes:
> 
>> On 01.05.20 18:57, James Morse wrote:
>>> On x86, the kexec payload contains a copy of the current memory map.
>>> If memory is added or removed, this copy of the memory map becomes
>>> stale. Getting this wrong may prevent the next kernel from booting.
>>> The first kernel may die if it tries to re-assemble the next kernel
>>> in memory that has been removed.
>>>
>>> Discard the loaded kexec image when the memory map changes, user-space
>>> should reload it.
>>>
>>> Kdump is unaffected, as it is placed within the crashkernel reserved
>>> memory area and only uses this memory. The stale memory map may affect
>>> generation of the vmcore, but the kdump kernel should be in a position
>>> to validate it.
>>>
>>> Signed-off-by: James Morse <james.morse@arm.com>
>>> ---
>>> This patch obsoletes:
>>>  * kexec/memory_hotplug: Prevent removal and accidental use
>>> https://lore.kernel.org/linux-arm-kernel/20200326180730.4754-1-james.morse@arm.com/
>>>
>>>  kernel/kexec_core.c | 40 ++++++++++++++++++++++++++++++++++++++++
>>>  1 file changed, 40 insertions(+)
>>>
>>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>>> index c19c0dad1ebe..e1901e5bd4b5 100644
>>> --- a/kernel/kexec_core.c
>>> +++ b/kernel/kexec_core.c
>>> @@ -12,6 +12,7 @@
>>>  #include <linux/slab.h>
>>>  #include <linux/fs.h>
>>>  #include <linux/kexec.h>
>>> +#include <linux/memory.h>
>>>  #include <linux/mutex.h>
>>>  #include <linux/list.h>
>>>  #include <linux/highmem.h>
>>> @@ -22,10 +23,12 @@
>>>  #include <linux/elf.h>
>>>  #include <linux/elfcore.h>
>>>  #include <linux/utsname.h>
>>> +#include <linux/notifier.h>
>>>  #include <linux/numa.h>
>>>  #include <linux/suspend.h>
>>>  #include <linux/device.h>
>>>  #include <linux/freezer.h>
>>> +#include <linux/pfn.h>
>>>  #include <linux/pm.h>
>>>  #include <linux/cpu.h>
>>>  #include <linux/uaccess.h>
>>> @@ -1219,3 +1222,40 @@ void __weak arch_kexec_protect_crashkres(void)
>>>  
>>>  void __weak arch_kexec_unprotect_crashkres(void)
>>>  {}
>>> +
>>> +/*
>>> + * If the memory layout changes, any loaded kexec image should be evicted
>>> + * as it may contain a copy of the (now stale) memory map. This also means
>>> + * we don't need to check the memory is still present when re-assembling the
>>> + * new kernel at machine_kexec() time.
>>> + */
>>
>> Onlining/offlining is not a change of the memory map.
> 
> Phrasing it that way is non-sense.  What is important is memory
> available in the system.  A memory map is just a reflection upon that,
> a memory map is not the definition of truth.
> 
> So if this notifier reflects when memory is coming and going on the
> system this is a reasonable approach.  
> 
> Do these notifiers might fire for special kinds of memory that should
> only be used for very special purposes?
> 
> This change with the addition of some filters say to limit taking action
> to MEM_ONLINE and MEM_OFFLINE looks reasonable to me.  Probably also
> filtering out special kinds of memory that is not gernally useful.

There are cases, where this notifier will not get called (e.g., hotplug
a DIMM and don't online it) or will get called, although nothing changed
(offline+re-online to a different zone triggered by user space). AFAIK,
nothing in kexec (*besides kdump) cares about online vs. offline memory.
This is why this feels wrong.

add_memory()/try_remove_memory() is the place where:
- Memblocks are created/deleted (if the memblock allocator is still
  alive)
- Memory resources are created/deleted (e.g., reflected in /proc/iomem)
- Firmware memmap entries are created/deleted (/sys/firmware/memmap)

My idea would be to add something like
kexec_map_add()/kexec_map_remove() where we have
firmware_map_add_hotplug()/firmware_map_remove(). From there, we can
unload the kexec image like done in this patch.

And these callbacks might come in handy for fixing up the kexec initial
memmap in case of kexec_file_load(). AFAIKS on x86_64:
- Hotplugging a DIMM will not add that memory to
  e820_table_kexec
- Hotunplugging a DIMM will not remove that memory from e820_table_kexec

Maybe we have similar things to handle on other architectures.
Baoquan He May 11, 2020, 11:27 a.m. UTC | #5
On 05/11/20 at 10:19am, David Hildenbrand wrote:
> On 09.05.20 17:14, Eric W. Biederman wrote:
> >>> + * If the memory layout changes, any loaded kexec image should be evicted
> >>> + * as it may contain a copy of the (now stale) memory map. This also means
> >>> + * we don't need to check the memory is still present when re-assembling the
> >>> + * new kernel at machine_kexec() time.
> >>> + */
> >>
> >> Onlining/offlining is not a change of the memory map.
> > 
> > Phrasing it that way is non-sense.  What is important is memory
> > available in the system.  A memory map is just a reflection upon that,
> > a memory map is not the definition of truth.
> > 
> > So if this notifier reflects when memory is coming and going on the
> > system this is a reasonable approach.  
> > 
> > Do these notifiers might fire for special kinds of memory that should
> > only be used for very special purposes?
> > 
> > This change with the addition of some filters say to limit taking action
> > to MEM_ONLINE and MEM_OFFLINE looks reasonable to me.  Probably also
> > filtering out special kinds of memory that is not gernally useful.
> 
> There are cases, where this notifier will not get called (e.g., hotplug
> a DIMM and don't online it) or will get called, although nothing changed
> (offline+re-online to a different zone triggered by user space). AFAIK,
> nothing in kexec (*besides kdump) cares about online vs. offline memory.
> This is why this feels wrong.
> 
> add_memory()/try_remove_memory() is the place where:
> - Memblocks are created/deleted (if the memblock allocator is still
>   alive)
> - Memory resources are created/deleted (e.g., reflected in /proc/iomem)
> - Firmware memmap entries are created/deleted (/sys/firmware/memmap)
> 
> My idea would be to add something like
> kexec_map_add()/kexec_map_remove() where we have
> firmware_map_add_hotplug()/firmware_map_remove(). From there, we can
> unload the kexec image like done in this patch.

Hi David,

I may miss some details, do you know why we have to unload the kexec image
when add/remove memory?

If this is applied, even kexec_file_load is also affected. As we
discussed, kexec_file_load is not impacted by kinds of memory
adding/removing at all.

Besides, if unload image in casae memory added/removed, we will accept
that the later 'kexec -e' is actually rebooting?

Thanks
Baoquan

> 
> And these callbacks might come in handy for fixing up the kexec initial
> memmap in case of kexec_file_load(). AFAIKS on x86_64:
> - Hotplugging a DIMM will not add that memory to
>   e820_table_kexec
> - Hotunplugging a DIMM will not remove that memory from e820_table_kexec
> 
> Maybe we have similar things to handle on other architectures.
> 
> -- 
> Thanks,
> 
> David / dhildenb
> 
>
David Hildenbrand May 11, 2020, 11:55 a.m. UTC | #6
On 11.05.20 13:27, Baoquan He wrote:
> On 05/11/20 at 10:19am, David Hildenbrand wrote:
>> On 09.05.20 17:14, Eric W. Biederman wrote:
>>>>> + * If the memory layout changes, any loaded kexec image should be evicted
>>>>> + * as it may contain a copy of the (now stale) memory map. This also means
>>>>> + * we don't need to check the memory is still present when re-assembling the
>>>>> + * new kernel at machine_kexec() time.
>>>>> + */
>>>>
>>>> Onlining/offlining is not a change of the memory map.
>>>
>>> Phrasing it that way is non-sense.  What is important is memory
>>> available in the system.  A memory map is just a reflection upon that,
>>> a memory map is not the definition of truth.
>>>
>>> So if this notifier reflects when memory is coming and going on the
>>> system this is a reasonable approach.  
>>>
>>> Do these notifiers might fire for special kinds of memory that should
>>> only be used for very special purposes?
>>>
>>> This change with the addition of some filters say to limit taking action
>>> to MEM_ONLINE and MEM_OFFLINE looks reasonable to me.  Probably also
>>> filtering out special kinds of memory that is not gernally useful.
>>
>> There are cases, where this notifier will not get called (e.g., hotplug
>> a DIMM and don't online it) or will get called, although nothing changed
>> (offline+re-online to a different zone triggered by user space). AFAIK,
>> nothing in kexec (*besides kdump) cares about online vs. offline memory.
>> This is why this feels wrong.
>>
>> add_memory()/try_remove_memory() is the place where:
>> - Memblocks are created/deleted (if the memblock allocator is still
>>   alive)
>> - Memory resources are created/deleted (e.g., reflected in /proc/iomem)
>> - Firmware memmap entries are created/deleted (/sys/firmware/memmap)
>>
>> My idea would be to add something like
>> kexec_map_add()/kexec_map_remove() where we have
>> firmware_map_add_hotplug()/firmware_map_remove(). From there, we can
>> unload the kexec image like done in this patch.
> 
> Hi David,
> 
> I may miss some details, do you know why we have to unload the kexec image
> when add/remove memory?
> 
> If this is applied, even kexec_file_load is also affected. As we
> discussed, kexec_file_load is not impacted by kinds of memory
> adding/removing at all.

kexec_load():

1. kexec-tools could have placed kexec images on memory that will be
removed.

2. the memory map of the guest is stale (esp., might still contain
hotunplugged memory). /sys/firmware/memmap and /proc/iomem will be
updated, so kexec-tools can fix this up.


kexec_file_load():

1. kexec could have placed kexec images on memory that will be removed,
especially when kexec_locate_mem_hole() is called to locate memory top-down.

IIRC, the memory map might also be stale and I agree that unloading
won't actually change something here (needs different fixes as I
explained regarding the kexec e820 map). Think about unplugging a DIMM
that was described in the e820 map during boot and was put into the
MOVABLE zone using cmdline parameters like "movablecore". After unplug,
it will still be described in the kexec e820 map.

I agree that we might might be able to make smarter decisions in the
kernel regarding kexec_file_load() - for example, try to find new
locations for kexec images. For now, this seems to be simple.

> 
> Besides, if unload image in casae memory added/removed, we will accept
> that the later 'kexec -e' is actually rebooting?

At least in the kernel, kernel_kexec() will bail out in case there is no
kexec_image loaded anymore. And we printed a message, so we can at least
figure out what happened.

Where is this rebooting you mention performed in case there is no image
loaded?
Eric W. Biederman May 11, 2020, 5:05 p.m. UTC | #7
David Hildenbrand <david@redhat.com> writes:

> On 09.05.20 17:14, Eric W. Biederman wrote:
>> David Hildenbrand <david@redhat.com> writes:
>> 
>>> On 01.05.20 18:57, James Morse wrote:
>>>> On x86, the kexec payload contains a copy of the current memory map.
>>>> If memory is added or removed, this copy of the memory map becomes
>>>> stale. Getting this wrong may prevent the next kernel from booting.
>>>> The first kernel may die if it tries to re-assemble the next kernel
>>>> in memory that has been removed.
>>>>
>>>> Discard the loaded kexec image when the memory map changes, user-space
>>>> should reload it.
>>>>
>>>> Kdump is unaffected, as it is placed within the crashkernel reserved
>>>> memory area and only uses this memory. The stale memory map may affect
>>>> generation of the vmcore, but the kdump kernel should be in a position
>>>> to validate it.
>>>>
>>>> Signed-off-by: James Morse <james.morse@arm.com>
>>>> ---
>>>> This patch obsoletes:
>>>>  * kexec/memory_hotplug: Prevent removal and accidental use
>>>> https://lore.kernel.org/linux-arm-kernel/20200326180730.4754-1-james.morse@arm.com/
>>>>
>>>>  kernel/kexec_core.c | 40 ++++++++++++++++++++++++++++++++++++++++
>>>>  1 file changed, 40 insertions(+)
>>>>
>>>> diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
>>>> index c19c0dad1ebe..e1901e5bd4b5 100644
>>>> --- a/kernel/kexec_core.c
>>>> +++ b/kernel/kexec_core.c
>>>> @@ -12,6 +12,7 @@
>>>>  #include <linux/slab.h>
>>>>  #include <linux/fs.h>
>>>>  #include <linux/kexec.h>
>>>> +#include <linux/memory.h>
>>>>  #include <linux/mutex.h>
>>>>  #include <linux/list.h>
>>>>  #include <linux/highmem.h>
>>>> @@ -22,10 +23,12 @@
>>>>  #include <linux/elf.h>
>>>>  #include <linux/elfcore.h>
>>>>  #include <linux/utsname.h>
>>>> +#include <linux/notifier.h>
>>>>  #include <linux/numa.h>
>>>>  #include <linux/suspend.h>
>>>>  #include <linux/device.h>
>>>>  #include <linux/freezer.h>
>>>> +#include <linux/pfn.h>
>>>>  #include <linux/pm.h>
>>>>  #include <linux/cpu.h>
>>>>  #include <linux/uaccess.h>
>>>> @@ -1219,3 +1222,40 @@ void __weak arch_kexec_protect_crashkres(void)
>>>>  
>>>>  void __weak arch_kexec_unprotect_crashkres(void)
>>>>  {}
>>>> +
>>>> +/*
>>>> + * If the memory layout changes, any loaded kexec image should be evicted
>>>> + * as it may contain a copy of the (now stale) memory map. This also means
>>>> + * we don't need to check the memory is still present when re-assembling the
>>>> + * new kernel at machine_kexec() time.
>>>> + */
>>>
>>> Onlining/offlining is not a change of the memory map.
>> 
>> Phrasing it that way is non-sense.  What is important is memory
>> available in the system.  A memory map is just a reflection upon that,
>> a memory map is not the definition of truth.
>> 
>> So if this notifier reflects when memory is coming and going on the
>> system this is a reasonable approach.  
>> 
>> Do these notifiers might fire for special kinds of memory that should
>> only be used for very special purposes?
>> 
>> This change with the addition of some filters say to limit taking action
>> to MEM_ONLINE and MEM_OFFLINE looks reasonable to me.  Probably also
>> filtering out special kinds of memory that is not gernally useful.
>
> There are cases, where this notifier will not get called (e.g., hotplug
> a DIMM and don't online it) or will get called, although nothing changed
> (offline+re-online to a different zone triggered by user space). AFAIK,
> nothing in kexec (*besides kdump) cares about online vs. offline memory.
> This is why this feels wrong.

So what precisely does offline and online of memory mean in this context?
Is it turning the memory on and off?  (which is the obvious meaning)
Or is offline and online letting the ordinary kernel use a chunk
of memory and not use a chunk of memory and the memory remains running
the entire time?


> add_memory()/try_remove_memory() is the place where:
> - Memblocks are created/deleted (if the memblock allocator is still
>   alive)
> - Memory resources are created/deleted (e.g., reflected in /proc/iomem)
> - Firmware memmap entries are created/deleted (/sys/firmware/memmap)
>
> My idea would be to add something like
> kexec_map_add()/kexec_map_remove() where we have
> firmware_map_add_hotplug()/firmware_map_remove(). From there, we can
> unload the kexec image like done in this patch.

I don't see the connection with a firmware_map.  Maybe that is how it is
thought about in the code but in principle the firmware can not exist
or completely ignore memory hotplug.

> And these callbacks might come in handy for fixing up the kexec initial
> memmap in case of kexec_file_load(). AFAIKS on x86_64:

Maybe we have enough information to fixup the loaded kexec image
in the kexec_file_load case, we certainly don't in the ordinary
kexec_load case.

For now I want to stick to the simplest thing we can do which is either
blocking the memory hotplug operation (if that is possible) or
dropping the loaded kexec image.

If that actually becomes a problem in practice we can do something more.
But for now let's just do the minimal we can do that will prevent
incorrect operation.

Eric
David Hildenbrand May 12, 2020, 7:45 a.m. UTC | #8
>>> Phrasing it that way is non-sense.  What is important is memory
>>> available in the system.  A memory map is just a reflection upon that,
>>> a memory map is not the definition of truth.
>>>
>>> So if this notifier reflects when memory is coming and going on the
>>> system this is a reasonable approach.  
>>>
>>> Do these notifiers might fire for special kinds of memory that should
>>> only be used for very special purposes?
>>>
>>> This change with the addition of some filters say to limit taking action
>>> to MEM_ONLINE and MEM_OFFLINE looks reasonable to me.  Probably also
>>> filtering out special kinds of memory that is not gernally useful.
>>
>> There are cases, where this notifier will not get called (e.g., hotplug
>> a DIMM and don't online it) or will get called, although nothing changed
>> (offline+re-online to a different zone triggered by user space). AFAIK,
>> nothing in kexec (*besides kdump) cares about online vs. offline memory.
>> This is why this feels wrong.
> 
> So what precisely does offline and online of memory mean in this context?
> Is it turning the memory on and off?  (which is the obvious meaning)
> Or is offline and online letting the ordinary kernel use a chunk
> of memory and not use a chunk of memory and the memory remains running
> the entire time?
> 

A DIMM is partitioned into fixed-size memory blocks. Each memory block
is represented in /sys/device/system/memory/memoryX/.

There, it can be onlined of offlined. onlining/offlining a memory block
simply defines
- if the memory will be used by the buddy
- how the memory will be used by the buddy (e.g., ZONE_NORMAL vs.
  ZONE_MOVABLE)
nothing else (esp. no hardware is switched on/off).

e.g.,

echo "online_movable" > /sys/devices/system/memory/memory9/state
echo "offline" > /sys/devices/system/memory/memory9/state
echo "online_kernel" > /sys/devices/system/memory/memory9/state

When hotplugging memory, all memory blocks are either onlined directly
from the kernel, or userspace has to do it manually via e.g., udev
rules. The latter is common is distributions.

Before hotunplugging memory, all memory blocks have to be offline. This
means
- memory was never onlined
- memory was offlined by user space manually
- memory will be offlined automatically when unplugging the dimm

Of course, offlining of some memory blocks might fail (esp. in case of
ZONE_NORMAL when they contain unmovable allocations). Then, the memory
cannot get hotunplugged.

The representation in /proc/iomem and /sys/firmware/memmap is
independent of the state (online/offline) of a memory block.

> 
>> add_memory()/try_remove_memory() is the place where:
>> - Memblocks are created/deleted (if the memblock allocator is still
>>   alive)
>> - Memory resources are created/deleted (e.g., reflected in /proc/iomem)
>> - Firmware memmap entries are created/deleted (/sys/firmware/memmap)
>>
>> My idea would be to add something like
>> kexec_map_add()/kexec_map_remove() where we have
>> firmware_map_add_hotplug()/firmware_map_remove(). From there, we can
>> unload the kexec image like done in this patch.
> 
> I don't see the connection with a firmware_map.  Maybe that is how it is
> thought about in the code but in principle the firmware can not exist
> or completely ignore memory hotplug.

The firmware_map callbacks simply update /sys/firmware/memmap in case
that interface is configured into the kernel (mostly x86 only), nothing
else. We just want similar callbacks to update kexec' representation.

> 
>> And these callbacks might come in handy for fixing up the kexec initial
>> memmap in case of kexec_file_load(). AFAIKS on x86_64:
> 
> Maybe we have enough information to fixup the loaded kexec image
> in the kexec_file_load case, we certainly don't in the ordinary
> kexec_load case.

Yes, that's also what I mentioned in my reply to Baoquan.

> 
> For now I want to stick to the simplest thing we can do which is either
> blocking the memory hotplug operation (if that is possible) or
> dropping the loaded kexec image.

Yes, the latter is the best for now. It's simple.

I am suggesting to add explicit callbacks to
add_memory()/remove_memory(), and calling the invalidation from there -
because I see various issues with the memory notifier approach (racy,
false positives, never called if memory is not onlined).
Baoquan He May 12, 2020, 10:34 a.m. UTC | #9
On 05/11/20 at 01:55pm, David Hildenbrand wrote:
> On 11.05.20 13:27, Baoquan He wrote:
> > On 05/11/20 at 10:19am, David Hildenbrand wrote:
> >> On 09.05.20 17:14, Eric W. Biederman wrote:
> >>>>> + * If the memory layout changes, any loaded kexec image should be evicted
> >>>>> + * as it may contain a copy of the (now stale) memory map. This also means
> >>>>> + * we don't need to check the memory is still present when re-assembling the
> >>>>> + * new kernel at machine_kexec() time.
> >>>>> + */
> >>>>
> >>>> Onlining/offlining is not a change of the memory map.
> >>>
> >>> Phrasing it that way is non-sense.  What is important is memory
> >>> available in the system.  A memory map is just a reflection upon that,
> >>> a memory map is not the definition of truth.
> >>>
> >>> So if this notifier reflects when memory is coming and going on the
> >>> system this is a reasonable approach.  
> >>>
> >>> Do these notifiers might fire for special kinds of memory that should
> >>> only be used for very special purposes?
> >>>
> >>> This change with the addition of some filters say to limit taking action
> >>> to MEM_ONLINE and MEM_OFFLINE looks reasonable to me.  Probably also
> >>> filtering out special kinds of memory that is not gernally useful.
> >>
> >> There are cases, where this notifier will not get called (e.g., hotplug
> >> a DIMM and don't online it) or will get called, although nothing changed
> >> (offline+re-online to a different zone triggered by user space). AFAIK,
> >> nothing in kexec (*besides kdump) cares about online vs. offline memory.
> >> This is why this feels wrong.
> >>
> >> add_memory()/try_remove_memory() is the place where:
> >> - Memblocks are created/deleted (if the memblock allocator is still
> >>   alive)
> >> - Memory resources are created/deleted (e.g., reflected in /proc/iomem)
> >> - Firmware memmap entries are created/deleted (/sys/firmware/memmap)
> >>
> >> My idea would be to add something like
> >> kexec_map_add()/kexec_map_remove() where we have
> >> firmware_map_add_hotplug()/firmware_map_remove(). From there, we can
> >> unload the kexec image like done in this patch.
> > 
> > Hi David,
> > 
> > I may miss some details, do you know why we have to unload the kexec image
> > when add/remove memory?
> > 
> > If this is applied, even kexec_file_load is also affected. As we
> > discussed, kexec_file_load is not impacted by kinds of memory
> > adding/removing at all.
> 
> kexec_load():
> 
> 1. kexec-tools could have placed kexec images on memory that will be
> removed.
> 
> 2. the memory map of the guest is stale (esp., might still contain
> hotunplugged memory). /sys/firmware/memmap and /proc/iomem will be
> updated, so kexec-tools can fix this up.

With my understanding, this is a corner case. Before James's last
patchset, I even hadn't realized this is a problem. Because we usually
load kexec image, next trigger a kexec rebooting. Wondering if James
just found out a potential issue, or he really met this problem. Surely,
we should fix it when have identified it, even though it's a corner
case.

And we suggested adding service of loading kexec to fix this. We
suggest this because kdump also need to recollect the memory regions
so that it can pass them into 2nd kernel and dump the newly added
memory region, or not dump the already removed memory region. 
Kdump kernel won't get problem during boot or running caused by the
hot added/removed memory as kexec kernel does, however, on failing to
achieve expected result, kdump and kexec have the same problem. I don't
see why kdump can be reloaded by memory adding/removing uevent triggering,
but kexec can't. If have to unload kexec image, does kdump image need
be unloaded?

Here my main concern is if it will complicate kexec code. While
reloading it via systemd service won't. No matther if it's making kexec
disable memory hotplug, or making memory hotplug disabling kexec, it
seems to couple kexec with other feature/subcomponent. Anyway, we have
added a kexec loading service, any memory adding/removing uevent will
trigger the reloading. This patch won't impact anything, even though
it doesn't make sense to us, so have no objection to this.

Another thing is below patch. Another case of complicating kexec because
of specific use case, please feel free to help review and add comment.
I am wondering if we can make it in user space too. E.g for oracle DB,
we limit the memory allocation within the movable nodes for memory
hotplugging, we can also add memmap= or mem= to kexec-ed kernel to protect
those memory regions inside the nodes, then restore the data from the nodes.
Not sure if VM data can be put in MOVABLE zone only.

[RFC 00/43] PKRAM: Preserved-over-Kexec RAM

> kexec_file_load():
> 
> 1. kexec could have placed kexec images on memory that will be removed,
> especially when kexec_locate_mem_hole() is called to locate memory top-down.
> 
> IIRC, the memory map might also be stale and I agree that unloading
> won't actually change something here (needs different fixes as I
> explained regarding the kexec e820 map). Think about unplugging a DIMM
> that was described in the e820 map during boot and was put into the
> MOVABLE zone using cmdline parameters like "movablecore". After unplug,
> it will still be described in the kexec e820 map.

Yes, this is a good catch. I thought to leave the e820_table_kexec
as is. As for the boot memory hotplug as you mentioned, it's a problem.
We can't tell kexec-ed kernel an unavailable region via e820. Once
updating e820_table_kexec, kexec_file_load will not be immune to
hotplugged memory any more. Otherwise the stale e820 map will pass to
kexec kernel, I haven't checked if it will impact system booting, will
check.

> 
> I agree that we might might be able to make smarter decisions in the
> kernel regarding kexec_file_load() - for example, try to find new
> locations for kexec images. For now, this seems to be simple.
> 
> > 
> > Besides, if unload image in casae memory added/removed, we will accept
> > that the later 'kexec -e' is actually rebooting?
> 
> At least in the kernel, kernel_kexec() will bail out in case there is no
> kexec_image loaded anymore. And we printed a message, so we can at least
> figure out what happened.
> 
> Where is this rebooting you mention performed in case there is no image
> loaded?

OK, I forgot it returned from reboot invocation w/o image loaded.

> 
> -- 
> Thanks,
> 
> David / dhildenb
David Hildenbrand May 12, 2020, 10:54 a.m. UTC | #10
>> kexec_load():
>>
>> 1. kexec-tools could have placed kexec images on memory that will be
>> removed.
>>
>> 2. the memory map of the guest is stale (esp., might still contain
>> hotunplugged memory). /sys/firmware/memmap and /proc/iomem will be
>> updated, so kexec-tools can fix this up.
> 
> With my understanding, this is a corner case. Before James's last
> patchset, I even hadn't realized this is a problem. Because we usually
> load kexec image, next trigger a kexec rebooting. Wondering if James
> just found out a potential issue, or he really met this problem. Surely,

Should be as easy as hotplugging a dimm, loading "kexec -c", unplugging
the dimm, triggering "kexec -e" if I am not wrong.

> we should fix it when have identified it, even though it's a corner
> case.
> 
> And we suggested adding service of loading kexec to fix this. We
> suggest this because kdump also need to recollect the memory regions
> so that it can pass them into 2nd kernel and dump the newly added
> memory region, or not dump the already removed memory region. 
> Kdump kernel won't get problem during boot or running caused by the
> hot added/removed memory as kexec kernel does, however, on failing to
> achieve expected result, kdump and kexec have the same problem. I don't
> see why kdump can be reloaded by memory adding/removing uevent triggering,
> but kexec can't. If have to unload kexec image, does kdump image need
> be unloaded?

I think that approach is racy and might easily trigger a crash when
"kexec -e" is called at the wrong time during memory unplug. See below
why kdump is different. Triggering unloading in the kernel does not
conflict with that approach and even seems to fit into the picture, no?

1. Memory gets hot(un)plugged
2. The kernel unloads the kexec image while processing the hot(un)plug
   to make sure nothing will go wrong.
3. User space gets notified and triggers reloading of kexec.

That sounds like a sane approach to me, no? If there is no 3., nothing
will break. If there is a "kexec -e" before 3 finished, nothing will
break. As we discussed, we might be able to special-case
kexec_file_load() and not unload, but simply fixup.

Note that kdump is slightly different. In case memory gets hotplugged
and kdump is not reloaded, that memory will simply not get dumped. In
case memory gets hotunplugged and kdump is not reloaded, that memory
will be skipped by makedumpfile (realizes memory is gone when parsing
the sparse sections, trying to find the memmap). In contrast to kexec,
there is no kernel crash.

> 
> Here my main concern is if it will complicate kexec code. While
> reloading it via systemd service won't. No matther if it's making kexec
> disable memory hotplug, or making memory hotplug disabling kexec, it
> seems to couple kexec with other feature/subcomponent. Anyway, we have
> added a kexec loading service, any memory adding/removing uevent will
> trigger the reloading. This patch won't impact anything, even though
> it doesn't make sense to us, so have no objection to this.

I don't consider unloading in the kernel a lot of complexity. And it
seems to be the right thing to do to avoid crashes, especially if user
space will not reload itself.

> 
> Another thing is below patch. Another case of complicating kexec because
> of specific use case, please feel free to help review and add comment.
> I am wondering if we can make it in user space too. E.g for oracle DB,
> we limit the memory allocation within the movable nodes for memory
> hotplugging, we can also add memmap= or mem= to kexec-ed kernel to protect
> those memory regions inside the nodes, then restore the data from the nodes.
> Not sure if VM data can be put in MOVABLE zone only.
> 
> [RFC 00/43] PKRAM: Preserved-over-Kexec RAM

I've seen that patch set and it is on my todo list, not sure when I'll
have time to look into it. From a quick glimpse, I had the feeling that
it was not dealing with memory hot(un)plug, most probably because
concurrent memory hot(un)plug is not the target use case.
Eric W. Biederman May 12, 2020, 11:59 a.m. UTC | #11
David Hildenbrand <david@redhat.com> writes:
>> 
>> Maybe we have enough information to fixup the loaded kexec image
>> in the kexec_file_load case, we certainly don't in the ordinary
>> kexec_load case.
>
> Yes, that's also what I mentioned in my reply to Baoquan.
>
>> 
>> For now I want to stick to the simplest thing we can do which is either
>> blocking the memory hotplug operation (if that is possible) or
>> dropping the loaded kexec image.
>
> Yes, the latter is the best for now. It's simple.
>
> I am suggesting to add explicit callbacks to
> add_memory()/remove_memory(), and calling the invalidation from there -
> because I see various issues with the memory notifier approach (racy,
> false positives, never called if memory is not onlined).

Ok so we are in agreement.  Correct patch.  Wrong trigger condition.

Eric
Baoquan He May 12, 2020, 2:11 p.m. UTC | #12
On 05/12/20 at 12:54pm, David Hildenbrand wrote:
> >> kexec_load():
> >>
> >> 1. kexec-tools could have placed kexec images on memory that will be
> >> removed.
> >>
> >> 2. the memory map of the guest is stale (esp., might still contain
> >> hotunplugged memory). /sys/firmware/memmap and /proc/iomem will be
> >> updated, so kexec-tools can fix this up.
> > 
> > With my understanding, this is a corner case. Before James's last
> > patchset, I even hadn't realized this is a problem. Because we usually
> > load kexec image, next trigger a kexec rebooting. Wondering if James
> > just found out a potential issue, or he really met this problem. Surely,
> 
> Should be as easy as hotplugging a dimm, loading "kexec -c", unplugging
> the dimm, triggering "kexec -e" if I am not wrong.

Hmm, kexec rebooting is also one kind of rebooting, we may not want to
hot plug memory during that time. But, yes, just in case.

> 
> > we should fix it when have identified it, even though it's a corner
> > case.
> > 
> > And we suggested adding service of loading kexec to fix this. We
> > suggest this because kdump also need to recollect the memory regions
> > so that it can pass them into 2nd kernel and dump the newly added
> > memory region, or not dump the already removed memory region. 
> > Kdump kernel won't get problem during boot or running caused by the
> > hot added/removed memory as kexec kernel does, however, on failing to
> > achieve expected result, kdump and kexec have the same problem. I don't
> > see why kdump can be reloaded by memory adding/removing uevent triggering,
> > but kexec can't. If have to unload kexec image, does kdump image need
> > be unloaded?
> 
> I think that approach is racy and might easily trigger a crash when
> "kexec -e" is called at the wrong time during memory unplug. See below
> why kdump is different. Triggering unloading in the kernel does not
> conflict with that approach and even seems to fit into the picture, no?
> 
> 1. Memory gets hot(un)plugged
> 2. The kernel unloads the kexec image while processing the hot(un)plug
>    to make sure nothing will go wrong.
> 3. User space gets notified and triggers reloading of kexec.
> 
> That sounds like a sane approach to me, no? If there is no 3., nothing
> will break. If there is a "kexec -e" before 3 finished, nothing will
> break. As we discussed, we might be able to special-case
> kexec_file_load() and not unload, but simply fixup.
> 
> Note that kdump is slightly different. In case memory gets hotplugged
> and kdump is not reloaded, that memory will simply not get dumped. In
> case memory gets hotunplugged and kdump is not reloaded, that memory
> will be skipped by makedumpfile (realizes memory is gone when parsing
> the sparse sections, trying to find the memmap). In contrast to kexec,
> there is no kernel crash.
> 
> > 
> > Here my main concern is if it will complicate kexec code. While
> > reloading it via systemd service won't. No matther if it's making kexec
> > disable memory hotplug, or making memory hotplug disabling kexec, it
> > seems to couple kexec with other feature/subcomponent. Anyway, we have
> > added a kexec loading service, any memory adding/removing uevent will
> > trigger the reloading. This patch won't impact anything, even though
> > it doesn't make sense to us, so have no objection to this.
> 
> I don't consider unloading in the kernel a lot of complexity. And it
> seems to be the right thing to do to avoid crashes, especially if user
> space will not reload itself.
> 
> > 
> > Another thing is below patch. Another case of complicating kexec because
> > of specific use case, please feel free to help review and add comment.
> > I am wondering if we can make it in user space too. E.g for oracle DB,
> > we limit the memory allocation within the movable nodes for memory
> > hotplugging, we can also add memmap= or mem= to kexec-ed kernel to protect
> > those memory regions inside the nodes, then restore the data from the nodes.
> > Not sure if VM data can be put in MOVABLE zone only.
> > 
> > [RFC 00/43] PKRAM: Preserved-over-Kexec RAM
> 
> I've seen that patch set and it is on my todo list, not sure when I'll
> have time to look into it. From a quick glimpse, I had the feeling that
> it was not dealing with memory hot(un)plug, most probably because
> concurrent memory hot(un)plug is not the target use case.

Not, it's not about hot plug. Hope you can help check if restoring VM data in
kexec-ed kernel have to be done like that from virt dev's point of view.
Please feel free to add other virt expert you know who is familiar with that
to the list to help review.
diff mbox series

Patch

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index c19c0dad1ebe..e1901e5bd4b5 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -12,6 +12,7 @@ 
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/kexec.h>
+#include <linux/memory.h>
 #include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/highmem.h>
@@ -22,10 +23,12 @@ 
 #include <linux/elf.h>
 #include <linux/elfcore.h>
 #include <linux/utsname.h>
+#include <linux/notifier.h>
 #include <linux/numa.h>
 #include <linux/suspend.h>
 #include <linux/device.h>
 #include <linux/freezer.h>
+#include <linux/pfn.h>
 #include <linux/pm.h>
 #include <linux/cpu.h>
 #include <linux/uaccess.h>
@@ -1219,3 +1222,40 @@  void __weak arch_kexec_protect_crashkres(void)
 
 void __weak arch_kexec_unprotect_crashkres(void)
 {}
+
+/*
+ * If the memory layout changes, any loaded kexec image should be evicted
+ * as it may contain a copy of the (now stale) memory map. This also means
+ * we don't need to check the memory is still present when re-assembling the
+ * new kernel at machine_kexec() time.
+ */
+static int mem_change_cb(struct notifier_block *nb, unsigned long action,
+			 void *data)
+{
+	/*
+	 * Actions are either a change, or a change being cancelled.
+	 * A second discard for 'cancel's is harmless.
+	 */
+
+	mutex_lock(&kexec_mutex);
+	if (kexec_image) {
+		kimage_free(xchg(&kexec_image, NULL));
+		pr_warn("loaded image discarded due to memory hotplug");
+	}
+	mutex_unlock(&kexec_mutex);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block mem_change_nb = {
+	.notifier_call = mem_change_cb,
+};
+
+static int __init register_mem_change_cb(void)
+{
+	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG))
+		return register_memory_notifier(&mem_change_nb);
+
+	return 0;
+}
+device_initcall(register_mem_change_cb);