diff mbox series

[RFC] mm: Avoid triggering oom-killer during memory hot-remove operations

Message ID 20240726084456.1309928-1-lizhijian@fujitsu.com (mailing list archive)
State New
Headers show
Series [RFC] mm: Avoid triggering oom-killer during memory hot-remove operations | expand

Commit Message

Zhijian Li (Fujitsu) July 26, 2024, 8:44 a.m. UTC
When a process is bound to a node that is being hot-removed, any memory
allocation attempts from that node should fail gracefully without
triggering the OOM-killer. However, the current behavior can cause the
oom-killer to be invoked, leading to the termination of processes on other
nodes, even when there is sufficient memory available in the system.

Prevent the oom-killer from being triggered by processes bound to a
node undergoing hot-remove operations. Instead, the allocation attempts
from the offlining node will simply fail, allowing the process to handle
the failure appropriately without causing disruption to the system.

Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
---
 include/linux/memory_hotplug.h |  6 ++++++
 mm/memory_hotplug.c            | 21 +++++++++++++++++++++
 mm/page_alloc.c                |  6 ++++++
 3 files changed, 33 insertions(+)

Comments

Michal Hocko July 26, 2024, 9:17 a.m. UTC | #1
On Fri 26-07-24 16:44:56, Li Zhijian wrote:
> When a process is bound to a node that is being hot-removed, any memory
> allocation attempts from that node should fail gracefully without
> triggering the OOM-killer. However, the current behavior can cause the
> oom-killer to be invoked, leading to the termination of processes on other
> nodes, even when there is sufficient memory available in the system.

But you said they are bound to the node that is offlined.
 
> Prevent the oom-killer from being triggered by processes bound to a
> node undergoing hot-remove operations. Instead, the allocation attempts
> from the offlining node will simply fail, allowing the process to handle
> the failure appropriately without causing disruption to the system.

NAK.

Also it is not really clear why the process of offlining should behave any
differently from the state after the node is offlined. Could you describe the
actual problem you are facing in much more detail, please?
 
> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
> ---
>  include/linux/memory_hotplug.h |  6 ++++++
>  mm/memory_hotplug.c            | 21 +++++++++++++++++++++
>  mm/page_alloc.c                |  6 ++++++
>  3 files changed, 33 insertions(+)
> 
> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> index 7a9ff464608d..0ca804215e11 100644
> --- a/include/linux/memory_hotplug.h
> +++ b/include/linux/memory_hotplug.h
> @@ -332,6 +332,7 @@ extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
>  extern int remove_memory(u64 start, u64 size);
>  extern void __remove_memory(u64 start, u64 size);
>  extern int offline_and_remove_memory(u64 start, u64 size);
> +bool is_offlining_node(nodemask_t nodes);
>  
>  #else
>  static inline void try_offline_node(int nid) {}
> @@ -348,6 +349,11 @@ static inline int remove_memory(u64 start, u64 size)
>  }
>  
>  static inline void __remove_memory(u64 start, u64 size) {}
> +
> +static inline bool is_offlining_node(nodemask_t nodes)
> +{
> +	return false;
> +}
>  #endif /* CONFIG_MEMORY_HOTREMOVE */
>  
>  #ifdef CONFIG_MEMORY_HOTPLUG
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 431b1f6753c0..da3982751ba9 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1938,6 +1938,22 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
>  	return 0;
>  }
>  
> +static nodemask_t offlining_node = NODE_MASK_NONE;
> +
> +bool is_offlining_node(nodemask_t nodes)
> +{
> +	return nodes_equal(offlining_node, nodes);
> +}
> +
> +static void offline_pages_start(int node)
> +{
> +	node_set(node, offlining_node);
> +}
> +
> +static void offline_pages_end(void)
> +{
> +	offlining_node = NODE_MASK_NONE;
> +}
>  /*
>   * Must be called with mem_hotplug_lock in write mode.
>   */
> @@ -1991,6 +2007,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
>  		goto failed_removal;
>  	}
>  
> +	offline_pages_start(node);
>  	/*
>  	 * Disable pcplists so that page isolation cannot race with freeing
>  	 * in a way that pages from isolated pageblock are left on pcplists.
> @@ -2107,6 +2124,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
>  
>  	memory_notify(MEM_OFFLINE, &arg);
>  	remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
> +	offline_pages_end();
> +
>  	return 0;
>  
>  failed_removal_isolated:
> @@ -2121,6 +2140,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
>  		 (unsigned long long) start_pfn << PAGE_SHIFT,
>  		 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
>  		 reason);
> +
> +	offline_pages_end();
>  	return ret;
>  }
>  
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 1780df31d5f5..acdab6b114a5 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -3563,6 +3563,12 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
>  	if (page)
>  		goto out;
>  
> +	/* hot-remove is on-going, it generally fails to allocate memory from
> +	 * the being removed memory node. Leave it alone.
> +	 */
> +	if (is_offlining_node(*ac->nodemask))
> +		goto out;
> +
>  	/* Coredumps can quickly deplete all memory reserves */
>  	if (current->flags & PF_DUMPCORE)
>  		goto out;
> -- 
> 2.29.2
>
Zhijian Li (Fujitsu) July 29, 2024, 12:37 a.m. UTC | #2
Michal,

Sorry for the late reply.


On 26/07/2024 17:17, Michal Hocko wrote:
> On Fri 26-07-24 16:44:56, Li Zhijian wrote:
>> When a process is bound to a node that is being hot-removed, any memory
>> allocation attempts from that node should fail gracefully without
>> triggering the OOM-killer. However, the current behavior can cause the
>> oom-killer to be invoked, leading to the termination of processes on other
>> nodes, even when there is sufficient memory available in the system.
> 
> But you said they are bound to the node that is offlined.
>   
>> Prevent the oom-killer from being triggered by processes bound to a
>> node undergoing hot-remove operations. Instead, the allocation attempts
>> from the offlining node will simply fail, allowing the process to handle
>> the failure appropriately without causing disruption to the system.
> 
> NAK.
> 
> Also it is not really clear why process of offlining should behave any
> different from after the node is offlined. Could you describe an actual
> problem you are facing with much more details please?

We found that some processes (including some system-critical services, for example sshd, rsyslogd, login)
were killed during our memory hot-remove testing. Our test program is described in a previous mail [1].

In short, we have 3 memory nodes: node0 and node1 are DRAM, while node2 is CXL volatile memory that is onlined
to ZONE_MOVABLE. When we attempted to remove node2, the oom-killer was invoked and killed other processes
(sshd, rsyslogd, login) even though there was enough memory on node0+node1.

This OOM kill was triggered by the memory allocation path of our own testing process, which was bound to node2.

So I expect:
- It is acceptable for our own test process to fail to allocate memory from node2, which is being hot-removed.
- The oom-killer should not be invoked to kill processes other than those running on node2.


[1] https://lore.kernel.org/linux-mm/6a07125f-e720-404c-b2f9-e55f3f166e85@fujitsu.com/


>   
>> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
>> ---
>>   include/linux/memory_hotplug.h |  6 ++++++
>>   mm/memory_hotplug.c            | 21 +++++++++++++++++++++
>>   mm/page_alloc.c                |  6 ++++++
>>   3 files changed, 33 insertions(+)
>>
>> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
>> index 7a9ff464608d..0ca804215e11 100644
>> --- a/include/linux/memory_hotplug.h
>> +++ b/include/linux/memory_hotplug.h
>> @@ -332,6 +332,7 @@ extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
>>   extern int remove_memory(u64 start, u64 size);
>>   extern void __remove_memory(u64 start, u64 size);
>>   extern int offline_and_remove_memory(u64 start, u64 size);
>> +bool is_offlining_node(nodemask_t nodes);
>>   
>>   #else
>>   static inline void try_offline_node(int nid) {}
>> @@ -348,6 +349,11 @@ static inline int remove_memory(u64 start, u64 size)
>>   }
>>   
>>   static inline void __remove_memory(u64 start, u64 size) {}
>> +
>> +static inline bool is_offlining_node(nodemask_t nodes)
>> +{
>> +	return false;
>> +}
>>   #endif /* CONFIG_MEMORY_HOTREMOVE */
>>   
>>   #ifdef CONFIG_MEMORY_HOTPLUG
>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>> index 431b1f6753c0..da3982751ba9 100644
>> --- a/mm/memory_hotplug.c
>> +++ b/mm/memory_hotplug.c
>> @@ -1938,6 +1938,22 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
>>   	return 0;
>>   }
>>   
>> +static nodemask_t offlining_node = NODE_MASK_NONE;
>> +
>> +bool is_offlining_node(nodemask_t nodes)
>> +{
>> +	return nodes_equal(offlining_node, nodes);
>> +}
>> +
>> +static void offline_pages_start(int node)
>> +{
>> +	node_set(node, offlining_node);
>> +}
>> +
>> +static void offline_pages_end(void)
>> +{
>> +	offlining_node = NODE_MASK_NONE;
>> +}
>>   /*
>>    * Must be called with mem_hotplug_lock in write mode.
>>    */
>> @@ -1991,6 +2007,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
>>   		goto failed_removal;
>>   	}
>>   
>> +	offline_pages_start(node);
>>   	/*
>>   	 * Disable pcplists so that page isolation cannot race with freeing
>>   	 * in a way that pages from isolated pageblock are left on pcplists.
>> @@ -2107,6 +2124,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
>>   
>>   	memory_notify(MEM_OFFLINE, &arg);
>>   	remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
>> +	offline_pages_end();
>> +
>>   	return 0;
>>   
>>   failed_removal_isolated:
>> @@ -2121,6 +2140,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
>>   		 (unsigned long long) start_pfn << PAGE_SHIFT,
>>   		 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
>>   		 reason);
>> +
>> +	offline_pages_end();
>>   	return ret;
>>   }
>>   
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 1780df31d5f5..acdab6b114a5 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -3563,6 +3563,12 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
>>   	if (page)
>>   		goto out;
>>   
>> +	/* hot-remove is on-going, it generally fails to allocate memory from
>> +	 * the being removed memory node. Leave it alone.
>> +	 */
>> +	if (is_offlining_node(*ac->nodemask))
>> +		goto out;
>> +
>>   	/* Coredumps can quickly deplete all memory reserves */
>>   	if (current->flags & PF_DUMPCORE)
>>   		goto out;
>> -- 
>> 2.29.2
>>
>
Zhijian Li (Fujitsu) July 29, 2024, 2:14 a.m. UTC | #3
On 29/07/2024 08:37, Li Zhijian wrote:
> Michal,
> 
> Sorry to the late reply.
> 
> 
> On 26/07/2024 17:17, Michal Hocko wrote:
>> On Fri 26-07-24 16:44:56, Li Zhijian wrote:
>>> When a process is bound to a node that is being hot-removed, any memory
>>> allocation attempts from that node should fail gracefully without
>>> triggering the OOM-killer. However, the current behavior can cause the
>>> oom-killer to be invoked, leading to the termination of processes on other
>>> nodes, even when there is sufficient memory available in the system.
>>
>> But you said they are bound to the node that is offlined.
>>> Prevent the oom-killer from being triggered by processes bound to a
>>> node undergoing hot-remove operations. Instead, the allocation attempts
>>> from the offlining node will simply fail, allowing the process to handle
>>> the failure appropriately without causing disruption to the system.
>>
>> NAK.
>>
>> Also it is not really clear why process of offlining should behave any
>> different from after the node is offlined. Could you describe an actual
>> problem you are facing with much more details please?
> 
> We encountered that some processes(including some system critical services, for example sshd, rsyslogd, login)
> were killed during our memory hot-remove testing. Our test program are described previous mail[1]
> 
> In short, we have 3 memory nodes, node0 and node1 are DRAM, while node2 is CXL volatile memory that is onlined
> to ZONE_MOVABLE. When we attempted to remove the node2, oom-killed was invoked to kill other processes
> (sshd, rsyslogd, login) even though there is enough memory on node0+node1.

The dmesg output is attached below:

[13853.707626] consume_std_pag invoked oom-killer: gfp_mask=0x140dca(GFP_HIGHUSER_MOVABLE|__GFP_COMP|__GFP_ZERO), order=0, oom_score_adj=0
[13853.708400] CPU: 1 PID: 274746 Comm: consume_std_pag Not tainted 6.10.0-rc2+ #160
[13853.708745] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
[13853.709161] Call Trace:
[13853.709161]  <TASK>
[13853.709161]  dump_stack_lvl+0x64/0x80
[13853.709161]  dump_header+0x44/0x1a0
[13853.709161]  oom_kill_process+0xf8/0x200
[13853.709161]  out_of_memory+0x110/0x590
[13853.709161]  __alloc_pages_slowpath.constprop.92+0xb5f/0xd80
[13853.709161]  __alloc_pages_noprof+0x354/0x380
[13853.709161]  alloc_pages_mpol_noprof+0xe3/0x1f0
[13853.709161]  vma_alloc_folio_noprof+0x5c/0xb0
[13853.709161]  folio_prealloc+0x21/0x80
[13853.709161]  do_pte_missing+0x695/0xa20
[13853.709161]  ? __pte_offset_map+0x1b/0x180
[13853.709161]  __handle_mm_fault+0x65f/0xc10
[13853.709161]  ? sched_tick+0xd7/0x2b0
[13853.709161]  handle_mm_fault+0x128/0x360
[13853.709161]  do_user_addr_fault+0x309/0x810
[13853.709161]  exc_page_fault+0x7e/0x180
[13853.709161]  asm_exc_page_fault+0x26/0x30
[13853.709161] RIP: 0033:0x7f1d3ae2428a
[13853.709161] Code: c5 fe 7f 07 c5 fe 7f 47 20 c5 fe 7f 47 40 c5 fe 7f 47 60 c5 f8 77 c3 66 0f 1f 84 00 00 00 00 00 40 0f b6 c6 48 89 d1 48 89 fa <f3> aa 48 89 d0 c5 f8 77 c3 66 66 2e 0f 1f 84 00 00 00 00 00 66 90
[13853.712991] RSP: 002b:00007ffe083a2388 EFLAGS: 00000206
[13853.712991] RAX: 0000000000000000 RBX: 00007ffe083a24d8 RCX: 0000000007eb8010
[13853.713915] Fallback order for Node 0: 0 1
[13853.713987] Fallback order for Node 1: 1 0
[13853.714006] Fallback order for Node 2: 0 1
[13853.714175] Built 3 zonelists, mobility grouping on.  Total pages: 2002419
[13853.712991] RDX: 00007f1d32c00010 RSI: 0000000000000000 RDI: 00007f1d32d48000
[13853.712991] RBP: 00007ffe083a23b0 R08: 00000000ffffffff R09: 0000000000000000
[13853.714439] Policy zone: Normal
[13853.712991] R10: 00007f1d3acd5200 R11: 00007f1d3ae241c0 R12: 0000000000000002
[13853.712991] R13: 0000000000000000 R14: 00007f1d3aeea000 R15: 0000000000403e00
[13853.712991]  </TASK>
[13853.716564] Mem-Info:
[13853.716688] active_anon:17939 inactive_anon:0 isolated_anon:0
[13853.716688]  active_file:132347 inactive_file:109560 isolated_file:0
[13853.716688]  unevictable:0 dirty:2021 writeback:0
[13853.716688]  slab_reclaimable:5876 slab_unreclaimable:18566
[13853.716688]  mapped:35589 shmem:251 pagetables:1809
[13853.716688]  sec_pagetables:0 bounce:0
[13853.716688]  kernel_misc_reclaimable:0
[13853.716688]  free:1694176 free_pcp:0 free_cma:0
[13853.718420] Node 2 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=1048576kB
[13853.718730] Node 2 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
[13853.719127] 242158 total pagecache pages
[13853.719310] 0 pages in swap cache
[13853.719441] Free swap  = 8142844kB
[13853.719583] Total swap = 8143868kB
[13853.719731] 2097019 pages RAM
[13853.719890] 0 pages HighMem/MovableOnly
[13853.720155] 60814 pages reserved
[13853.720278] 0 pages cma reserved
[13853.720393] 0 pages hwpoisoned
[13853.720494] Tasks state (memory values in pages):
[13853.720686] [  pid  ]   uid  tgid total_vm      rss rss_anon rss_file rss_shmem pgtables_bytes swapents oom_score_adj name
[13853.721214] [    718]     0   718    40965    29598      256    29342         0   368640        0          -250 systemd-journal
<...snip...>
[13853.747190] [ 274715]     0 274715     8520     1731      879      852         0    73728        0             0 (udev-worker)
[13853.747561] [ 274743]     0 274743      617      384        0      384         0    45056        0             0 consume_activit
[13853.748099] [ 274744]     0 274744      617      281        0      281         0    45056        0             0 consume_activit
[13853.748479] [ 274745]     0 274745     2369      954      128      826         0    61440        0             0 daxctl
[13853.748885] [ 274746]     0 274746    33386      667      320      347         0    49152        0             0 consume_std_pag
<...snip...>
[13853.755653] [ 274808]     0 274808     3534      251       32      219         0    61440        0             0 systemctl
[13853.756151] oom-kill:constraint=CONSTRAINT_MEMORY_POLICY,nodemask=2,cpuset=/,mems_allowed=0-2,global_oom,task_memcg=/system.slice/rsyslog.service,task=rsyslogd,pid=274557,uid=0
[13853.756791] Out of memory: Killed process 274557 (rsyslogd) total-vm:957964kB, anon-rss:640kB, file-rss:46496kB, shmem-rss:0kB, UID:0 pgtables:1512kB oom_score_adj:0
[13853.758192] pagefault_out_of_memory: 4055 callbacks suppressed
[13853.758243] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
[13853.758865] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
[13853.759319] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
[13853.759564] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
[13853.759779] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
[13853.760128] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
[13853.760361] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
[13853.760588] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
[13853.760794] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
[13853.761187] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
[13853.774166] Demotion targets for Node 0: null
[13853.774478] Demotion targets for Node 1: null



> 
> This oom-killed was triggered by allocating memory path of our own testing process which was bound to node2.
> 
> So I expect,
> - our own tes process failed to allocate memory from node2 which is being hot-removed is acceptable.
> - oom-killer should not be invoked to kill processes other than running on node2.
> 
> 
> [1] https://lore.kernel.org/linux-mm/6a07125f-e720-404c-b2f9-e55f3f166e85@fujitsu.com/
> 
> 
>>> Signed-off-by: Li Zhijian <lizhijian@fujitsu.com>
>>> ---
>>>   include/linux/memory_hotplug.h |  6 ++++++
>>>   mm/memory_hotplug.c            | 21 +++++++++++++++++++++
>>>   mm/page_alloc.c                |  6 ++++++
>>>   3 files changed, 33 insertions(+)
>>>
>>> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
>>> index 7a9ff464608d..0ca804215e11 100644
>>> --- a/include/linux/memory_hotplug.h
>>> +++ b/include/linux/memory_hotplug.h
>>> @@ -332,6 +332,7 @@ extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
>>>   extern int remove_memory(u64 start, u64 size);
>>>   extern void __remove_memory(u64 start, u64 size);
>>>   extern int offline_and_remove_memory(u64 start, u64 size);
>>> +bool is_offlining_node(nodemask_t nodes);
>>>   #else
>>>   static inline void try_offline_node(int nid) {}
>>> @@ -348,6 +349,11 @@ static inline int remove_memory(u64 start, u64 size)
>>>   }
>>>   static inline void __remove_memory(u64 start, u64 size) {}
>>> +
>>> +static inline bool is_offlining_node(nodemask_t nodes)
>>> +{
>>> +    return false;
>>> +}
>>>   #endif /* CONFIG_MEMORY_HOTREMOVE */
>>>   #ifdef CONFIG_MEMORY_HOTPLUG
>>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>>> index 431b1f6753c0..da3982751ba9 100644
>>> --- a/mm/memory_hotplug.c
>>> +++ b/mm/memory_hotplug.c
>>> @@ -1938,6 +1938,22 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
>>>       return 0;
>>>   }
>>> +static nodemask_t offlining_node = NODE_MASK_NONE;
>>> +
>>> +bool is_offlining_node(nodemask_t nodes)
>>> +{
>>> +    return nodes_equal(offlining_node, nodes);
>>> +}
>>> +
>>> +static void offline_pages_start(int node)
>>> +{
>>> +    node_set(node, offlining_node);
>>> +}
>>> +
>>> +static void offline_pages_end(void)
>>> +{
>>> +    offlining_node = NODE_MASK_NONE;
>>> +}
>>>   /*
>>>    * Must be called with mem_hotplug_lock in write mode.
>>>    */
>>> @@ -1991,6 +2007,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
>>>           goto failed_removal;
>>>       }
>>> +    offline_pages_start(node);
>>>       /*
>>>        * Disable pcplists so that page isolation cannot race with freeing
>>>        * in a way that pages from isolated pageblock are left on pcplists.
>>> @@ -2107,6 +2124,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
>>>       memory_notify(MEM_OFFLINE, &arg);
>>>       remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
>>> +    offline_pages_end();
>>> +
>>>       return 0;
>>>   failed_removal_isolated:
>>> @@ -2121,6 +2140,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
>>>            (unsigned long long) start_pfn << PAGE_SHIFT,
>>>            ((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
>>>            reason);
>>> +
>>> +    offline_pages_end();
>>>       return ret;
>>>   }
>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>> index 1780df31d5f5..acdab6b114a5 100644
>>> --- a/mm/page_alloc.c
>>> +++ b/mm/page_alloc.c
>>> @@ -3563,6 +3563,12 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
>>>       if (page)
>>>           goto out;
>>> +    /* hot-remove is on-going, it generally fails to allocate memory from
>>> +     * the being removed memory node. Leave it alone.
>>> +     */
>>> +    if (is_offlining_node(*ac->nodemask))
>>> +        goto out;
>>> +
>>>       /* Coredumps can quickly deplete all memory reserves */
>>>       if (current->flags & PF_DUMPCORE)
>>>           goto out;
>>> -- 
>>> 2.29.2
>>>
>>
Michal Hocko July 29, 2024, 6:13 a.m. UTC | #4
On Mon 29-07-24 02:14:13, Zhijian Li (Fujitsu) wrote:
> 
> 
> On 29/07/2024 08:37, Li Zhijian wrote:
> > Michal,
> > 
> > Sorry to the late reply.
> > 
> > 
> > On 26/07/2024 17:17, Michal Hocko wrote:
> >> On Fri 26-07-24 16:44:56, Li Zhijian wrote:
> >>> When a process is bound to a node that is being hot-removed, any memory
> >>> allocation attempts from that node should fail gracefully without
> >>> triggering the OOM-killer. However, the current behavior can cause the
> >>> oom-killer to be invoked, leading to the termination of processes on other
> >>> nodes, even when there is sufficient memory available in the system.
> >>
> >> But you said they are bound to the node that is offlined.
> >>> Prevent the oom-killer from being triggered by processes bound to a
> >>> node undergoing hot-remove operations. Instead, the allocation attempts
> >>> from the offlining node will simply fail, allowing the process to handle
> >>> the failure appropriately without causing disruption to the system.
> >>
> >> NAK.
> >>
> >> Also it is not really clear why process of offlining should behave any
> >> different from after the node is offlined. Could you describe an actual
> >> problem you are facing with much more details please?
> > 
> > We encountered that some processes(including some system critical services, for example sshd, rsyslogd, login)
> > were killed during our memory hot-remove testing. Our test program are described previous mail[1]
> > 
> > In short, we have 3 memory nodes, node0 and node1 are DRAM, while node2 is CXL volatile memory that is onlined
> > to ZONE_MOVABLE. When we attempted to remove the node2, oom-killed was invoked to kill other processes
> > (sshd, rsyslogd, login) even though there is enough memory on node0+node1.

What are the sizes of those nodes, how much memory does the testing program
consume, and do you have an oom report without the patch applied?
Zhijian Li (Fujitsu) July 29, 2024, 6:34 a.m. UTC | #5
on 7/29/2024 2:13 PM, Michal Hocko wrote:
> On Mon 29-07-24 02:14:13, Zhijian Li (Fujitsu) wrote:
>>
>> On 29/07/2024 08:37, Li Zhijian wrote:
>>> Michal,
>>>
>>> Sorry to the late reply.
>>>
>>>
>>> On 26/07/2024 17:17, Michal Hocko wrote:
>>>> On Fri 26-07-24 16:44:56, Li Zhijian wrote:
>>>>> When a process is bound to a node that is being hot-removed, any memory
>>>>> allocation attempts from that node should fail gracefully without
>>>>> triggering the OOM-killer. However, the current behavior can cause the
>>>>> oom-killer to be invoked, leading to the termination of processes on other
>>>>> nodes, even when there is sufficient memory available in the system.
>>>> But you said they are bound to the node that is offlined.
>>>>> Prevent the oom-killer from being triggered by processes bound to a
>>>>> node undergoing hot-remove operations. Instead, the allocation attempts
>>>>> from the offlining node will simply fail, allowing the process to handle
>>>>> the failure appropriately without causing disruption to the system.
>>>> NAK.
>>>>
>>>> Also it is not really clear why process of offlining should behave any
>>>> different from after the node is offlined. Could you describe an actual
>>>> problem you are facing with much more details please?
>>> We encountered that some processes(including some system critical services, for example sshd, rsyslogd, login)
>>> were killed during our memory hot-remove testing. Our test program are described previous mail[1]
>>>
>>> In short, we have 3 memory nodes, node0 and node1 are DRAM, while node2 is CXL volatile memory that is onlined
>>> to ZONE_MOVABLE. When we attempted to remove the node2, oom-killed was invoked to kill other processes
>>> (sshd, rsyslogd, login) even though there is enough memory on node0+node1.
> What are sizes of those nodes, how much memory does the testing program
> consumes and do you have oom report without the patch applied?
>
node0: 4G, node1: 4G, node2: 2G

My testing program consumes 64M of memory. It's running on an *IDLE* 
system.

[root@localhost guest]# numactl -H

available: 3 nodes (0-2)
node 0 cpus: 0 1
node 0 size: 3927 MB
node 0 free: 3449 MB
node 1 cpus: 2 3
node 1 size: 4028 MB
node 1 free: 3614 MB
node 2 cpus:
node 2 size: 2048 MB
node 2 free: 2048 MB
node distances:
node   0   1   2
    0:  10  20  20
    1:  20  10  20
    2:  20  20  10


An oom report is as follows:
[13853.707626] consume_std_pag invoked oom-killer: 
gfp_mask=0x140dca(GFP_HIGHUSER_MOVABLE|__GFP_COMP|__GFP_ZERO), order=0, 
oom_score_adj=0
[13853.708400] CPU: 1 PID: 274746 Comm: consume_std_pag Not tainted 
6.10.0-rc2+ #160
[13853.708745] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
[13853.709161] Call Trace:
[13853.709161] <TASK>
[13853.709161] dump_stack_lvl+0x64/0x80
[13853.709161] dump_header+0x44/0x1a0
[13853.709161] oom_kill_process+0xf8/0x200
[13853.709161] out_of_memory+0x110/0x590
[13853.709161] __alloc_pages_slowpath.constprop.92+0xb5f/0xd80
[13853.709161] __alloc_pages_noprof+0x354/0x380
[13853.709161] alloc_pages_mpol_noprof+0xe3/0x1f0
[13853.709161] vma_alloc_folio_noprof+0x5c/0xb0
[13853.709161] folio_prealloc+0x21/0x80
[13853.709161] do_pte_missing+0x695/0xa20
[13853.709161] ? __pte_offset_map+0x1b/0x180
[13853.709161] __handle_mm_fault+0x65f/0xc10
[13853.709161] ? sched_tick+0xd7/0x2b0
[13853.709161] handle_mm_fault+0x128/0x360
[13853.709161] do_user_addr_fault+0x309/0x810
[13853.709161] exc_page_fault+0x7e/0x180
[13853.709161] asm_exc_page_fault+0x26/0x30
[13853.709161] RIP: 0033:0x7f1d3ae2428a
[13853.709161] Code: c5 fe 7f 07 c5 fe 7f 47 20 c5 fe 7f 47 40 c5 fe 7f 
47 60 c5 f8 77 c3 66 0f 1f 84 00 00 00 00 00 40 0f b6 c6 48 89 d1 48 89 
fa <f3> aa 48 89 d0 c5 f8 77 c3 66 66 2e 0f 1f 84 00 00 00 00 00 66 90
[13853.712991] RSP: 002b:00007ffe083a2388 EFLAGS: 00000206
[13853.712991] RAX: 0000000000000000 RBX: 00007ffe083a24d8 RCX: 
0000000007eb8010
[13853.713915] Fallback order for Node 0: 0 1
[13853.713987] Fallback order for Node 1: 1 0
[13853.714006] Fallback order for Node 2: 0 1
[13853.714175] Built 3 zonelists, mobility grouping on. Total pages: 2002419
[13853.712991] RDX: 00007f1d32c00010 RSI: 0000000000000000 RDI: 
00007f1d32d48000
[13853.712991] RBP: 00007ffe083a23b0 R08: 00000000ffffffff R09: 
0000000000000000
[13853.714439] Policy zone: Normal
[13853.712991] R10: 00007f1d3acd5200 R11: 00007f1d3ae241c0 R12: 
0000000000000002
[13853.712991] R13: 0000000000000000 R14: 00007f1d3aeea000 R15: 
0000000000403e00
[13853.712991] </TASK>
[13853.716564] Mem-Info:
[13853.716688] active_anon:17939 inactive_anon:0 isolated_anon:0
[13853.716688] active_file:132347 inactive_file:109560 isolated_file:0
[13853.716688] unevictable:0 dirty:2021 writeback:0
[13853.716688] slab_reclaimable:5876 slab_unreclaimable:18566
[13853.716688] mapped:35589 shmem:251 pagetables:1809
[13853.716688] sec_pagetables:0 bounce:0
[13853.716688] kernel_misc_reclaimable:0
[13853.716688] free:1694176 free_pcp:0 free_cma:0
[13853.718420] Node 2 hugepages_total=0 hugepages_free=0 
hugepages_surp=0 hugepages_size=1048576kB
[13853.718730] Node 2 hugepages_total=0 hugepages_free=0 
hugepages_surp=0 hugepages_size=2048kB
[13853.719127] 242158 total pagecache pages
[13853.719310] 0 pages in swap cache
[13853.719441] Free swap = 8142844kB
[13853.719583] Total swap = 8143868kB
[13853.719731] 2097019 pages RAM
[13853.719890] 0 pages HighMem/MovableOnly
[13853.720155] 60814 pages reserved
[13853.720278] 0 pages cma reserved
[13853.720393] 0 pages hwpoisoned
[13853.720494] Tasks state (memory values in pages):
[13853.720686] [ pid ] uid tgid total_vm rss rss_anon rss_file rss_shmem 
pgtables_bytes swapents oom_score_adj name
[13853.721214] [ 718] 0 718 40965 29598 256 29342 0 368640 0 -250 
systemd-journal
<...snip...>
[13853.747190] [ 274715] 0 274715 8520 1731 879 852 0 73728 0 0 
(udev-worker)
[13853.747561] [ 274743] 0 274743 617 384 0 384 0 45056 0 0 consume_activit
[13853.748099] [ 274744] 0 274744 617 281 0 281 0 45056 0 0 consume_activit
[13853.748479] [ 274745] 0 274745 2369 954 128 826 0 61440 0 0 daxctl
[13853.748885] [ 274746] 0 274746 33386 667 320 347 0 49152 0 0 
consume_std_pag
<...snip...>
[13853.755653] [ 274808] 0 274808 3534 251 32 219 0 61440 0 0 systemctl
[13853.756151] 
oom-kill:constraint=CONSTRAINT_MEMORY_POLICY,nodemask=2,cpuset=/,mems_allowed=0-2,global_oom,task_memcg=/system.slice/rsyslog.service,task=rsyslogd,pid=274557,uid=0
[13853.756791] Out of memory: Killed process 274557 (rsyslogd) 
total-vm:957964kB, anon-rss:640kB, file-rss:46496kB, shmem-rss:0kB, 
UID:0 pgtables:1512kB oom_score_adj:0
[13853.758192] pagefault_out_of_memory: 4055 callbacks suppressed
[13853.758243] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
[13853.758865] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
[13853.759319] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
[13853.759564] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
[13853.759779] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
[13853.760128] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
[13853.760361] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
[13853.760588] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
[13853.760794] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
[13853.761187] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
[13853.774166] Demotion targets for Node 0: null
[13853.774478] Demotion targets for Node 1: null
Michal Hocko July 29, 2024, 7:40 a.m. UTC | #6
On Mon 29-07-24 06:34:21, Zhijian Li (Fujitsu) wrote:
> 
> 
> on 7/29/2024 2:13 PM, Michal Hocko wrote:
> > On Mon 29-07-24 02:14:13, Zhijian Li (Fujitsu) wrote:
> >>
> >> On 29/07/2024 08:37, Li Zhijian wrote:
> >>> Michal,
> >>>
> >>> Sorry to the late reply.
> >>>
> >>>
> >>> On 26/07/2024 17:17, Michal Hocko wrote:
> >>>> On Fri 26-07-24 16:44:56, Li Zhijian wrote:
> >>>>> When a process is bound to a node that is being hot-removed, any memory
> >>>>> allocation attempts from that node should fail gracefully without
> >>>>> triggering the OOM-killer. However, the current behavior can cause the
> >>>>> oom-killer to be invoked, leading to the termination of processes on other
> >>>>> nodes, even when there is sufficient memory available in the system.
> >>>> But you said they are bound to the node that is offlined.
> >>>>> Prevent the oom-killer from being triggered by processes bound to a
> >>>>> node undergoing hot-remove operations. Instead, the allocation attempts
> >>>>> from the offlining node will simply fail, allowing the process to handle
> >>>>> the failure appropriately without causing disruption to the system.
> >>>> NAK.
> >>>>
> >>>> Also it is not really clear why process of offlining should behave any
> >>>> different from after the node is offlined. Could you describe an actual
> >>>> problem you are facing with much more details please?
> >>> We encountered that some processes(including some system critical services, for example sshd, rsyslogd, login)
> >>> were killed during our memory hot-remove testing. Our test program are described previous mail[1]
> >>>
> >>> In short, we have 3 memory nodes, node0 and node1 are DRAM, while node2 is CXL volatile memory that is onlined
> >>> to ZONE_MOVABLE. When we attempted to remove the node2, oom-killed was invoked to kill other processes
> >>> (sshd, rsyslogd, login) even though there is enough memory on node0+node1.
> > What are sizes of those nodes, how much memory does the testing program
> > consumes and do you have oom report without the patch applied?
> >
> node0: 4G, node1: 4G, node2: 2G
> 
> my testing program will consume 64M memory. It's running on an *IDEL* 
> system.
> 
> [root@localhost guest]# numactl -H
> 
> available: 3 nodes (0-2)
> node 0 cpus: 0 1
> node 0 size: 3927 MB
> node 0 free: 3449 MB
> node 1 cpus: 2 3
> node 1 size: 4028 MB
> node 1 free: 3614 MB
> node 2 cpus:
> node 2 size: 2048 MB
> node 2 free: 2048 MB
> node distances:
> node   0   1   2
>     0:  10  20  20
>     1:  20  10  20
>     2:  20  20  10
> 
> 
> An oom report is as following:
> [13853.707626] consume_std_pag invoked oom-killer: 
> gfp_mask=0x140dca(GFP_HIGHUSER_MOVABLE|__GFP_COMP|__GFP_ZERO), order=0, 
> oom_score_adj=0
> [13853.708400] CPU: 1 PID: 274746 Comm: consume_std_pag Not tainted 
> 6.10.0-rc2+ #160
> [13853.708745] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
> rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
> [13853.709161] Call Trace:
> [13853.709161] <TASK>
> [13853.709161] dump_stack_lvl+0x64/0x80
> [13853.709161] dump_header+0x44/0x1a0
> [13853.709161] oom_kill_process+0xf8/0x200
> [13853.709161] out_of_memory+0x110/0x590
> [13853.709161] __alloc_pages_slowpath.constprop.92+0xb5f/0xd80
> [13853.709161] __alloc_pages_noprof+0x354/0x380
> [13853.709161] alloc_pages_mpol_noprof+0xe3/0x1f0
> [13853.709161] vma_alloc_folio_noprof+0x5c/0xb0
> [13853.709161] folio_prealloc+0x21/0x80
> [13853.709161] do_pte_missing+0x695/0xa20
> [13853.709161] ? __pte_offset_map+0x1b/0x180
> [13853.709161] __handle_mm_fault+0x65f/0xc10
> [13853.709161] ? sched_tick+0xd7/0x2b0
> [13853.709161] handle_mm_fault+0x128/0x360
> [13853.709161] do_user_addr_fault+0x309/0x810
> [13853.709161] exc_page_fault+0x7e/0x180
> [13853.709161] asm_exc_page_fault+0x26/0x30
> [13853.709161] RIP: 0033:0x7f1d3ae2428a
> [13853.709161] Code: c5 fe 7f 07 c5 fe 7f 47 20 c5 fe 7f 47 40 c5 fe 7f 
> 47 60 c5 f8 77 c3 66 0f 1f 84 00 00 00 00 00 40 0f b6 c6 48 89 d1 48 89 
> fa <f3> aa 48 89 d0 c5 f8 77 c3 66 66 2e 0f 1f 84 00 00 00 00 00 66 90
> [13853.712991] RSP: 002b:00007ffe083a2388 EFLAGS: 00000206
> [13853.712991] RAX: 0000000000000000 RBX: 00007ffe083a24d8 RCX: 
> 0000000007eb8010
> [13853.713915] Fallback order for Node 0: 0 1
> [13853.713987] Fallback order for Node 1: 1 0
> [13853.714006] Fallback order for Node 2: 0 1
> [13853.714175] Built 3 zonelists, mobility grouping on. Total pages: 2002419
> [13853.712991] RDX: 00007f1d32c00010 RSI: 0000000000000000 RDI: 
> 00007f1d32d48000
> [13853.712991] RBP: 00007ffe083a23b0 R08: 00000000ffffffff R09: 
> 0000000000000000
> [13853.714439] Policy zone: Normal
> [13853.712991] R10: 00007f1d3acd5200 R11: 00007f1d3ae241c0 R12: 
> 0000000000000002
> [13853.712991] R13: 0000000000000000 R14: 00007f1d3aeea000 R15: 
> 0000000000403e00
> [13853.712991] </TASK>
> [13853.716564] Mem-Info:
> [13853.716688] active_anon:17939 inactive_anon:0 isolated_anon:0
> [13853.716688] active_file:132347 inactive_file:109560 isolated_file:0
> [13853.716688] unevictable:0 dirty:2021 writeback:0
> [13853.716688] slab_reclaimable:5876 slab_unreclaimable:18566
> [13853.716688] mapped:35589 shmem:251 pagetables:1809
> [13853.716688] sec_pagetables:0 bounce:0
> [13853.716688] kernel_misc_reclaimable:0
> [13853.716688] free:1694176 free_pcp:0 free_cma:0
> [13853.718420] Node 2 hugepages_total=0 hugepages_free=0 
> hugepages_surp=0 hugepages_size=1048576kB
> [13853.718730] Node 2 hugepages_total=0 hugepages_free=0 
> hugepages_surp=0 hugepages_size=2048kB
> [13853.719127] 242158 total pagecache pages
> [13853.719310] 0 pages in swap cache
> [13853.719441] Free swap = 8142844kB
> [13853.719583] Total swap = 8143868kB
> [13853.719731] 2097019 pages RAM
> [13853.719890] 0 pages HighMem/MovableOnly
> [13853.720155] 60814 pages reserved
> [13853.720278] 0 pages cma reserved
> [13853.720393] 0 pages hwpoisoned
> [13853.720494] Tasks state (memory values in pages):
> [13853.720686] [ pid ] uid tgid total_vm rss rss_anon rss_file rss_shmem 
> pgtables_bytes swapents oom_score_adj name
> [13853.721214] [ 718] 0 718 40965 29598 256 29342 0 368640 0 -250 
> systemd-journal
> <...snip...>
> [13853.747190] [ 274715] 0 274715 8520 1731 879 852 0 73728 0 0 
> (udev-worker)
> [13853.747561] [ 274743] 0 274743 617 384 0 384 0 45056 0 0 consume_activit
> [13853.748099] [ 274744] 0 274744 617 281 0 281 0 45056 0 0 consume_activit
> [13853.748479] [ 274745] 0 274745 2369 954 128 826 0 61440 0 0 daxctl
> [13853.748885] [ 274746] 0 274746 33386 667 320 347 0 49152 0 0 
> consume_std_pag
> <...snip...>
> [13853.755653] [ 274808] 0 274808 3534 251 32 219 0 61440 0 0 systemctl
> [13853.756151] 
> oom-kill:constraint=CONSTRAINT_MEMORY_POLICY,nodemask=2,cpuset=/,mems_allowed=0-2,global_oom,task_memcg=/system.slice/rsyslog.service,task=rsyslogd,pid=274557,uid=0
> [13853.756791] Out of memory: Killed process 274557 (rsyslogd) 
> total-vm:957964kB, anon-rss:640kB, file-rss:46496kB, shmem-rss:0kB, 
> UID:0 pgtables:1512kB oom_score_adj:0

OK, I guess I can see what is going on now. You are binding your memory
allocating task to the node you are offlining completely. This obviously
triggers OOM on that node. The oom killer is not really great at
handling memory policy OOMs because it is lacking per-node memory
consumption data for processes. That means that rather than killing the
test program which continues consuming memory - and not much of it - it
keeps killing other tasks with a higher memory consumption.

This is really unfortunate but not something that should be handled by
special casing memory offlining but rather handling the mempolicy OOMs
better. There were some attempts in the past but never made it to a
mergeable state. Maybe you want to pick up on that.

> [13853.758192] pagefault_out_of_memory: 4055 callbacks suppressed
> [13853.758243] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF

This shouldn't really happen and it indicates that some memory
allocation in the pagefault path has failed.

> [13853.758865] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> [13853.759319] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> [13853.759564] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> [13853.759779] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> [13853.760128] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> [13853.760361] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> [13853.760588] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> [13853.760794] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> [13853.761187] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> [13853.774166] Demotion targets for Node 0: null
> [13853.774478] Demotion targets for Node 1: null
>
Zhijian Li (Fujitsu) July 29, 2024, 8:04 a.m. UTC | #7
On 29/07/2024 15:40, Michal Hocko wrote:
> On Mon 29-07-24 06:34:21, Zhijian Li (Fujitsu) wrote:
>>
>>
>> on 7/29/2024 2:13 PM, Michal Hocko wrote:
>>> On Mon 29-07-24 02:14:13, Zhijian Li (Fujitsu) wrote:
>>>>
>>>> On 29/07/2024 08:37, Li Zhijian wrote:
>>>>> Michal,
>>>>>
>>>>> Sorry to the late reply.
>>>>>
>>>>>
>>>>> On 26/07/2024 17:17, Michal Hocko wrote:
>>>>>> On Fri 26-07-24 16:44:56, Li Zhijian wrote:
>>>>>>> When a process is bound to a node that is being hot-removed, any memory
>>>>>>> allocation attempts from that node should fail gracefully without
>>>>>>> triggering the OOM-killer. However, the current behavior can cause the
>>>>>>> oom-killer to be invoked, leading to the termination of processes on other
>>>>>>> nodes, even when there is sufficient memory available in the system.
>>>>>> But you said they are bound to the node that is offlined.
>>>>>>> Prevent the oom-killer from being triggered by processes bound to a
>>>>>>> node undergoing hot-remove operations. Instead, the allocation attempts
>>>>>>> from the offlining node will simply fail, allowing the process to handle
>>>>>>> the failure appropriately without causing disruption to the system.
>>>>>> NAK.
>>>>>>
>>>>>> Also it is not really clear why process of offlining should behave any
>>>>>> different from after the node is offlined. Could you describe an actual
>>>>>> problem you are facing with much more details please?
>>>>> We encountered that some processes(including some system critical services, for example sshd, rsyslogd, login)
>>>>> were killed during our memory hot-remove testing. Our test program are described previous mail[1]
>>>>>
>>>>> In short, we have 3 memory nodes, node0 and node1 are DRAM, while node2 is CXL volatile memory that is onlined
>>>>> to ZONE_MOVABLE. When we attempted to remove the node2, oom-killed was invoked to kill other processes
>>>>> (sshd, rsyslogd, login) even though there is enough memory on node0+node1.
>>> What are sizes of those nodes, how much memory does the testing program
>>> consumes and do you have oom report without the patch applied?
>>>
>> node0: 4G, node1: 4G, node2: 2G
>>
>> my testing program will consume 64M memory. It's running on an *IDEL*
>> system.
>>
>> [root@localhost guest]# numactl -H
>>
>> available: 3 nodes (0-2)
>> node 0 cpus: 0 1
>> node 0 size: 3927 MB
>> node 0 free: 3449 MB
>> node 1 cpus: 2 3
>> node 1 size: 4028 MB
>> node 1 free: 3614 MB
>> node 2 cpus:
>> node 2 size: 2048 MB
>> node 2 free: 2048 MB
>> node distances:
>> node   0   1   2
>>      0:  10  20  20
>>      1:  20  10  20
>>      2:  20  20  10
>>
>>
>> An oom report is as following:
>> [13853.707626] consume_std_pag invoked oom-killer:
>> gfp_mask=0x140dca(GFP_HIGHUSER_MOVABLE|__GFP_COMP|__GFP_ZERO), order=0,
>> oom_score_adj=0
>> [13853.708400] CPU: 1 PID: 274746 Comm: consume_std_pag Not tainted
>> 6.10.0-rc2+ #160
>> [13853.708745] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS
>> rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
>> [13853.709161] Call Trace:
>> [13853.709161] <TASK>
>> [13853.709161] dump_stack_lvl+0x64/0x80
>> [13853.709161] dump_header+0x44/0x1a0
>> [13853.709161] oom_kill_process+0xf8/0x200
>> [13853.709161] out_of_memory+0x110/0x590
>> [13853.709161] __alloc_pages_slowpath.constprop.92+0xb5f/0xd80
>> [13853.709161] __alloc_pages_noprof+0x354/0x380
>> [13853.709161] alloc_pages_mpol_noprof+0xe3/0x1f0
>> [13853.709161] vma_alloc_folio_noprof+0x5c/0xb0
>> [13853.709161] folio_prealloc+0x21/0x80
>> [13853.709161] do_pte_missing+0x695/0xa20
>> [13853.709161] ? __pte_offset_map+0x1b/0x180
>> [13853.709161] __handle_mm_fault+0x65f/0xc10
>> [13853.709161] ? sched_tick+0xd7/0x2b0
>> [13853.709161] handle_mm_fault+0x128/0x360
>> [13853.709161] do_user_addr_fault+0x309/0x810
>> [13853.709161] exc_page_fault+0x7e/0x180
>> [13853.709161] asm_exc_page_fault+0x26/0x30
>> [13853.709161] RIP: 0033:0x7f1d3ae2428a
>> [13853.709161] Code: c5 fe 7f 07 c5 fe 7f 47 20 c5 fe 7f 47 40 c5 fe 7f
>> 47 60 c5 f8 77 c3 66 0f 1f 84 00 00 00 00 00 40 0f b6 c6 48 89 d1 48 89
>> fa <f3> aa 48 89 d0 c5 f8 77 c3 66 66 2e 0f 1f 84 00 00 00 00 00 66 90
>> [13853.712991] RSP: 002b:00007ffe083a2388 EFLAGS: 00000206
>> [13853.712991] RAX: 0000000000000000 RBX: 00007ffe083a24d8 RCX:
>> 0000000007eb8010
>> [13853.713915] Fallback order for Node 0: 0 1
>> [13853.713987] Fallback order for Node 1: 1 0
>> [13853.714006] Fallback order for Node 2: 0 1
>> [13853.714175] Built 3 zonelists, mobility grouping on. Total pages: 2002419
>> [13853.712991] RDX: 00007f1d32c00010 RSI: 0000000000000000 RDI:
>> 00007f1d32d48000
>> [13853.712991] RBP: 00007ffe083a23b0 R08: 00000000ffffffff R09:
>> 0000000000000000
>> [13853.714439] Policy zone: Normal
>> [13853.712991] R10: 00007f1d3acd5200 R11: 00007f1d3ae241c0 R12:
>> 0000000000000002
>> [13853.712991] R13: 0000000000000000 R14: 00007f1d3aeea000 R15:
>> 0000000000403e00
>> [13853.712991] </TASK>
>> [13853.716564] Mem-Info:
>> [13853.716688] active_anon:17939 inactive_anon:0 isolated_anon:0
>> [13853.716688] active_file:132347 inactive_file:109560 isolated_file:0
>> [13853.716688] unevictable:0 dirty:2021 writeback:0
>> [13853.716688] slab_reclaimable:5876 slab_unreclaimable:18566
>> [13853.716688] mapped:35589 shmem:251 pagetables:1809
>> [13853.716688] sec_pagetables:0 bounce:0
>> [13853.716688] kernel_misc_reclaimable:0
>> [13853.716688] free:1694176 free_pcp:0 free_cma:0
>> [13853.718420] Node 2 hugepages_total=0 hugepages_free=0
>> hugepages_surp=0 hugepages_size=1048576kB
>> [13853.718730] Node 2 hugepages_total=0 hugepages_free=0
>> hugepages_surp=0 hugepages_size=2048kB
>> [13853.719127] 242158 total pagecache pages
>> [13853.719310] 0 pages in swap cache
>> [13853.719441] Free swap = 8142844kB
>> [13853.719583] Total swap = 8143868kB
>> [13853.719731] 2097019 pages RAM
>> [13853.719890] 0 pages HighMem/MovableOnly
>> [13853.720155] 60814 pages reserved
>> [13853.720278] 0 pages cma reserved
>> [13853.720393] 0 pages hwpoisoned
>> [13853.720494] Tasks state (memory values in pages):
>> [13853.720686] [ pid ] uid tgid total_vm rss rss_anon rss_file rss_shmem
>> pgtables_bytes swapents oom_score_adj name
>> [13853.721214] [ 718] 0 718 40965 29598 256 29342 0 368640 0 -250
>> systemd-journal
>> <...snip...>
>> [13853.747190] [ 274715] 0 274715 8520 1731 879 852 0 73728 0 0
>> (udev-worker)
>> [13853.747561] [ 274743] 0 274743 617 384 0 384 0 45056 0 0 consume_activit
>> [13853.748099] [ 274744] 0 274744 617 281 0 281 0 45056 0 0 consume_activit
>> [13853.748479] [ 274745] 0 274745 2369 954 128 826 0 61440 0 0 daxctl
>> [13853.748885] [ 274746] 0 274746 33386 667 320 347 0 49152 0 0
>> consume_std_pag
>> <...snip...>
>> [13853.755653] [ 274808] 0 274808 3534 251 32 219 0 61440 0 0 systemctl
>> [13853.756151]
>> oom-kill:constraint=CONSTRAINT_MEMORY_POLICY,nodemask=2,cpuset=/,mems_allowed=0-2,global_oom,task_memcg=/system.slice/rsyslog.service,task=rsyslogd,pid=274557,uid=0
>> [13853.756791] Out of memory: Killed process 274557 (rsyslogd)
>> total-vm:957964kB, anon-rss:640kB, file-rss:46496kB, shmem-rss:0kB,
>> UID:0 pgtables:1512kB oom_score_adj:0
> 
> OK, I guess I can see what is going on now. You are binding your memory
> allocating task to the node you are offlining completely. This obviously
> triggers OOM on that node. The oom killer is not really great at
> handling memory policy OOMs because it is lacking per-node memory
> consumption data for processes. 

Yes, that's the situation


> That means that rather than killing the
> test program which continues consuming memory - and not much of it - it
> keeps killing other tasks with a higher memory consumption.

This behavior is not what I (as an administrator) expect.



> 
> This is really unfortunate but not something that should be handled by
> special casing memory offlining but rather handling the mempolicy OOMs
> better. There were some attempts in the past but never made it to a
> mergable state. Maybe you want to pick up on that.


Well, please point me to the previous proposals (mail/URL) if you have them at hand.
I want to take a look.



> 
>> [13853.758192] pagefault_out_of_memory: 4055 callbacks suppressed
>> [13853.758243] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> 
> This shouldn't really happen and it indicates that some memory
> allocation in the pagefault path has failed.

May I know whether this will cause side effects for other processes?


Thanks
Zhijian


> 
>> [13853.758865] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
>> [13853.759319] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
>> [13853.759564] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
>> [13853.759779] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
>> [13853.760128] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
>> [13853.760361] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
>> [13853.760588] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
>> [13853.760794] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
>> [13853.761187] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
>> [13853.774166] Demotion targets for Node 0: null
>> [13853.774478] Demotion targets for Node 1: null
>>
>
Michal Hocko July 29, 2024, 8:15 a.m. UTC | #8
On Mon 29-07-24 08:04:19, Zhijian Li (Fujitsu) wrote:
> On 29/07/2024 15:40, Michal Hocko wrote:
> > That means that rather than killing the
> > test program which continues consuming memory - and not much of it - it
> > keeps killing other tasks with a higher memory consumption.
> 
> This behavior is not my(administrator) expectation.

Well, this lack of proper NUMA-aware oom killer behavior has been there
for decades without many people complaining about it enough to push for a
better implementation. So while this is not great, it seems not that many
people are suffering from it.

In general dealing with a complete memory node hotremove while there are
applications with strong numa policies is quite hard to do right and
there will always be a certain level of suffering.
 
> > This is really unfortunate but not something that should be handled by
> > special casing memory offlining but rather handling the mempolicy OOMs
> > better. There were some attempts in the past but never made it to a
> > mergable state. Maybe you want to pick up on that.
> 
> 
> Well, tell me the previous proposals(mail/url) please if you have the them in hand.
> I want to take a look.

https://lore.kernel.org/all/20220708082129.80115-1-ligang.bdlg@bytedance.com/

btw. lore.kernel.org has a great search engine.

> >> [13853.758192] pagefault_out_of_memory: 4055 callbacks suppressed
> >> [13853.758243] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> > 
> > This shouldn't really happen and it indicates that some memory
> > allocation in the pagefault path has failed.
> 
> May I know if this will cause side effect to other processes.

This will mean that the #PF handler has failed to allocate memory and
the VM_FAULT_OOM error has unwound all the way up to the exception
handler and that will restart the instruction that has caused the #PF.

In essence, as long as the process triggering this is not killed or the
allocation doesn't succeed it will be looping in the #PF path. This
normally doesn't happen because our allocators do not fail for small
allocation requests.
Zhijian Li (Fujitsu) July 29, 2024, 8:53 a.m. UTC | #9
On 29/07/2024 16:15, Michal Hocko wrote:
> On Mon 29-07-24 08:04:19, Zhijian Li (Fujitsu) wrote:
>> On 29/07/2024 15:40, Michal Hocko wrote:
>>> That means that rather than killing the
>>> test program which continues consuming memory - and not much of it - it
>>> keeps killing other tasks with a higher memory consumption.
>>
>> This behavior is not my(administrator) expectation.
> 
> Well, this lack of proper NUMA aware oom killer behavior is there since
> decades without many people complaining about that enough to push for a
> better implementation. So while this is not great it seems not that many
> people are suffering from that.
> 
> In general dealing with a complete memory node hotremove while there are
> applications with strong numa policies is quite hard to do right and
> there will always be a certain level of suffering.


Thank you very much for your explanation.
Let me rethink it again...




>   
>>> This is really unfortunate but not something that should be handled by
>>> special casing memory offlining but rather handling the mempolicy OOMs
>>> better. There were some attempts in the past but never made it to a
>>> mergable state. Maybe you want to pick up on that.
>>
>>
>> Well, tell me the previous proposals(mail/url) please if you have the them in hand.
>> I want to take a look.
> 
> https://lore.kernel.org/all/20220708082129.80115-1-ligang.bdlg@bytedance.com/
> 
> btw. lore.kernel.org has a great searching engine.

I will take a look later.



> 
>>>> [13853.758192] pagefault_out_of_memory: 4055 callbacks suppressed
>>>> [13853.758243] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
>>>
>>> This shouldn't really happen and it indicates that some memory
>>> allocation in the pagefault path has failed.
>>
>> May I know if this will cause side effect to other processes.
> 
> This eill mean that the #PF handler has failed to allocate memory and
> the VM_FAULT_OOM error has unwound all the way up to the exception
> handler and that will restart the instruction that has caused the #PF.
> 
> In essence, as long as the process triggering this is not killed or the
> allocation doesn't suceed it will be looping in the #PF path. This
> normally doesn't happen because our allocators do not fail for small
> allocation requests.

Thanks again for your detailed explanation.

I think this is acceptable for the process bound to the node being removed, isn't it?
Michal Hocko July 29, 2024, 9:16 a.m. UTC | #10
On Mon 29-07-24 08:53:11, Zhijian Li (Fujitsu) wrote:
[...]
> >>>> [13853.758192] pagefault_out_of_memory: 4055 callbacks suppressed
> >>>> [13853.758243] Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF
> >>>
> >>> This shouldn't really happen and it indicates that some memory
> >>> allocation in the pagefault path has failed.
> >>
> >> May I know if this will cause side effect to other processes.
> > 
> > This eill mean that the #PF handler has failed to allocate memory and
> > the VM_FAULT_OOM error has unwound all the way up to the exception
> > handler and that will restart the instruction that has caused the #PF.
> > 
> > In essence, as long as the process triggering this is not killed or the
> > allocation doesn't suceed it will be looping in the #PF path. This
> > normally doesn't happen because our allocators do not fail for small
> > allocation requests.
> 
> Thanks again for your detailed explanation.
> 
> I think this is acceptable for the process bound to the being removed node, isn't it?

It shouldn't be happening really. This is a sign that something doesn't
behave properly. E.g. some of the #PF returning VM_FAULT_OOM without
calling into the allocator.
diff mbox series

Patch

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 7a9ff464608d..0ca804215e11 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -332,6 +332,7 @@  extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 extern int remove_memory(u64 start, u64 size);
 extern void __remove_memory(u64 start, u64 size);
 extern int offline_and_remove_memory(u64 start, u64 size);
+bool is_offlining_node(nodemask_t nodes);
 
 #else
 static inline void try_offline_node(int nid) {}
@@ -348,6 +349,11 @@  static inline int remove_memory(u64 start, u64 size)
 }
 
 static inline void __remove_memory(u64 start, u64 size) {}
+
+static inline bool is_offlining_node(nodemask_t nodes)
+{
+	return false;
+}
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
 #ifdef CONFIG_MEMORY_HOTPLUG
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 431b1f6753c0..da3982751ba9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1938,6 +1938,22 @@  static int count_system_ram_pages_cb(unsigned long start_pfn,
 	return 0;
 }
 
+static nodemask_t offlining_node = NODE_MASK_NONE;
+
+bool is_offlining_node(nodemask_t nodes)
+{
+	return nodes_equal(offlining_node, nodes);
+}
+
+static void offline_pages_start(int node)
+{
+	node_set(node, offlining_node);
+}
+
+static void offline_pages_end(void)
+{
+	offlining_node = NODE_MASK_NONE;
+}
 /*
  * Must be called with mem_hotplug_lock in write mode.
  */
@@ -1991,6 +2007,7 @@  int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 		goto failed_removal;
 	}
 
+	offline_pages_start(node);
 	/*
 	 * Disable pcplists so that page isolation cannot race with freeing
 	 * in a way that pages from isolated pageblock are left on pcplists.
@@ -2107,6 +2124,8 @@  int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 
 	memory_notify(MEM_OFFLINE, &arg);
 	remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
+	offline_pages_end();
+
 	return 0;
 
 failed_removal_isolated:
@@ -2121,6 +2140,8 @@  int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
 		 (unsigned long long) start_pfn << PAGE_SHIFT,
 		 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
 		 reason);
+
+	offline_pages_end();
 	return ret;
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1780df31d5f5..acdab6b114a5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3563,6 +3563,12 @@  __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	if (page)
 		goto out;
 
+	/* hot-remove is on-going, it generally fails to allocate memory from
+	 * the being removed memory node. Leave it alone.
+	 */
+	if (is_offlining_node(*ac->nodemask))
+		goto out;
+
 	/* Coredumps can quickly deplete all memory reserves */
 	if (current->flags & PF_DUMPCORE)
 		goto out;