Message ID: 20240920221202.1734227-3-kaiyang2@cs.cmu.edu (mailing list archive)
State:      New
Series:     memory tiering fairness by per-cgroup control of promotion and demotion
Hello,

kernel test robot noticed "BUG:kernel_NULL_pointer_dereference,address" on:

commit: 6f4c005a5f8b8ff1ce674731545b302af5f28f3f ("[RFC PATCH 2/4] calculate memory.low for the local node and track its usage")
url: https://github.com/intel-lab-lkp/linux/commits/kaiyang2-cs-cmu-edu/Add-get_cgroup_local_usage-for-estimating-the-top-tier-memory-usage/20240921-061404
base: https://git.kernel.org/cgit/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/all/20240920221202.1734227-3-kaiyang2@cs.cmu.edu/
patch subject: [RFC PATCH 2/4] calculate memory.low for the local node and track its usage

in testcase: boot

compiler: gcc-12
test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G

(please refer to attached dmesg/kmsg for entire log/backtrace)

+---------------------------------------------+------------+------------+
|                                             | 0af685cc17 | 6f4c005a5f |
+---------------------------------------------+------------+------------+
| boot_successes                              | 12         | 0          |
| boot_failures                               | 0          | 12         |
| BUG:kernel_NULL_pointer_dereference,address | 0          | 12         |
| Oops                                        | 0          | 12         |
| RIP:si_meminfo_node                         | 0          | 12         |
| Kernel_panic-not_syncing:Fatal_exception    | 0          | 12         |
+---------------------------------------------+------------+------------+

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags

| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202409221625.1e974ac-oliver.sang@intel.com

[   14.204830][    T1] BUG: kernel NULL pointer dereference, address: 0000000000000090
[   14.206729][    T1] #PF: supervisor read access in kernel mode
[   14.208090][    T1] #PF: error_code(0x0000) - not-present page
[   14.209393][    T1] PGD 0 P4D 0
[   14.210212][    T1] Oops: Oops: 0000 [#1] SMP PTI
[   14.211269][    T1] CPU: 1 UID: 0 PID: 1 Comm: systemd Not tainted 6.11.0-rc6-00570-g6f4c005a5f8b #1
[   14.213284][    T1] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
[   14.215290][    T1] RIP: 0010:si_meminfo_node (arch/x86/include/asm/atomic64_64.h:15 (discriminator 3) include/linux/atomic/atomic-arch-fallback.h:2583 (discriminator 3) include/linux/atomic/atomic-long.h:38 (discriminator 3) include/linux/atomic/atomic-instrumented.h:3189 (discriminator 3) include/linux/mmzone.h:1042 (discriminator 3) mm/show_mem.c:98 (discriminator 3))
[   14.216523][    T1] Code: 90 90 66 0f 1f 00 0f 1f 44 00 00 48 63 c6 55 31 d2 4c 8b 04 c5 c0 a7 fb 8c 53 48 89 c5 48 89 fb 4c 89 c0 49 8d b8 00 1e 00 00 <48> 8b 88 90 00 00 00 48 05 00 06 00 00 48 01 ca 48 39 f8 75 eb 48
All code
========
   0:  90                      nop
   1:  90                      nop
   2:  66 0f 1f 00             nopw   (%rax)
   6:  0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
   b:  48 63 c6                movslq %esi,%rax
   e:  55                      push   %rbp
   f:  31 d2                   xor    %edx,%edx
  11:  4c 8b 04 c5 c0 a7 fb    mov    -0x73045840(,%rax,8),%r8
  18:  8c
  19:  53                      push   %rbx
  1a:  48 89 c5                mov    %rax,%rbp
  1d:  48 89 fb                mov    %rdi,%rbx
  20:  4c 89 c0                mov    %r8,%rax
  23:  49 8d b8 00 1e 00 00    lea    0x1e00(%r8),%rdi
  2a:* 48 8b 88 90 00 00 00    mov    0x90(%rax),%rcx          <-- trapping instruction
  31:  48 05 00 06 00 00       add    $0x600,%rax
  37:  48 01 ca                add    %rcx,%rdx
  3a:  48 39 f8                cmp    %rdi,%rax
  3d:  75 eb                   jne    0x2a
  3f:  48                      rex.W

Code starting with the faulting instruction
===========================================
   0:  48 8b 88 90 00 00 00    mov    0x90(%rax),%rcx
   7:  48 05 00 06 00 00       add    $0x600,%rax
   d:  48 01 ca                add    %rcx,%rdx
  10:  48 39 f8                cmp    %rdi,%rax
  13:  75 eb                   jne    0x0
  15:  48                      rex.W
[   14.220364][    T1] RSP: 0018:ffffb14b40013d68 EFLAGS: 00010246
[   14.221717][    T1] RAX: 0000000000000000 RBX: ffffb14b40013d88 RCX: 00000000003a19a2
[   14.223496][    T1] RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000001e00
[   14.225170][    T1] RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000008
[   14.226964][    T1] R10: 0000000000000008 R11: 0fffffffffffffff R12: ffffb14b40013d88
[   14.228774][    T1] R13: 00000000003e7ac3 R14: ffffb14b40013e88 R15: ffff98ab0434f7a0
[   14.230421][    T1] FS:  00007f9569ae9940(0000) GS:ffff98adefd00000(0000) knlGS:0000000000000000
[   14.234569][    T1] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   14.235900][    T1] CR2: 0000000000000090 CR3: 0000000100072000 CR4: 00000000000006f0
[   14.237620][    T1] Call Trace:
[   14.238502][    T1]  <TASK>
[   14.239254][    T1]  ? __die (arch/x86/kernel/dumpstack.c:421 arch/x86/kernel/dumpstack.c:434)
[   14.240189][    T1]  ? page_fault_oops (arch/x86/mm/fault.c:715)
[   14.241254][    T1]  ? exc_page_fault (arch/x86/include/asm/irqflags.h:37 arch/x86/include/asm/irqflags.h:92 arch/x86/mm/fault.c:1489 arch/x86/mm/fault.c:1539)
[   14.242297][    T1]  ? asm_exc_page_fault (arch/x86/include/asm/idtentry.h:623)
[   14.243313][    T1]  ? si_meminfo_node (arch/x86/include/asm/atomic64_64.h:15 (discriminator 3) include/linux/atomic/atomic-arch-fallback.h:2583 (discriminator 3) include/linux/atomic/atomic-long.h:38 (discriminator 3) include/linux/atomic/atomic-instrumented.h:3189 (discriminator 3) include/linux/mmzone.h:1042 (discriminator 3) mm/show_mem.c:98 (discriminator 3))
[   14.244443][    T1]  ? si_meminfo_node (mm/show_mem.c:114)
[   14.245460][    T1]  memory_low_write (mm/memcontrol.c:4088)
[   14.246547][    T1]  kernfs_fop_write_iter (fs/kernfs/file.c:338)
[   14.247804][    T1]  vfs_write (fs/read_write.c:497 fs/read_write.c:590)
[   14.248830][    T1]  ksys_write (fs/read_write.c:643)
[   14.249783][    T1]  do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83)
[   14.250800][    T1]  entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
[   14.252260][    T1] RIP: 0033:0x7f956a64b240
[   14.253276][    T1] Code: 40 00 48 8b 15 c1 9b 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 80 3d a1 23 0e 00 00 74 17 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 48 83 ec 28 48 89
All code
========
   0:  40 00 48 8b             add    %cl,-0x75(%rax)
   4:  15 c1 9b 0d 00          adc    $0xd9bc1,%eax
   9:  f7 d8                   neg    %eax
   b:  64 89 02                mov    %eax,%fs:(%rdx)
   e:  48 c7 c0 ff ff ff ff    mov    $0xffffffffffffffff,%rax
  15:  eb b7                   jmp    0xffffffffffffffce
  17:  0f 1f 00                nopl   (%rax)
  1a:  80 3d a1 23 0e 00 00    cmpb   $0x0,0xe23a1(%rip)        # 0xe23c2
  21:  74 17                   je     0x3a
  23:  b8 01 00 00 00          mov    $0x1,%eax
  28:  0f 05                   syscall
  2a:* 48 3d 00 f0 ff ff       cmp    $0xfffffffffffff000,%rax          <-- trapping instruction
  30:  77 58                   ja     0x8a
  32:  c3                      retq
  33:  0f 1f 80 00 00 00 00    nopl   0x0(%rax)
  3a:  48 83 ec 28             sub    $0x28,%rsp
  3e:  48                      rex.W
  3f:  89                      .byte 0x89

Code starting with the faulting instruction
===========================================
   0:  48 3d 00 f0 ff ff       cmp    $0xfffffffffffff000,%rax
   6:  77 58                   ja     0x60
   8:  c3                      retq
   9:  0f 1f 80 00 00 00 00    nopl   0x0(%rax)
  10:  48 83 ec 28             sub    $0x28,%rsp
  14:  48                      rex.W
  15:  89                      .byte 0x89
[   14.257195][    T1] RSP: 002b:00007ffcc66594e8 EFLAGS: 00000202 ORIG_RAX: 0000000000000001
[   14.259009][    T1] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f956a64b240
[   14.260848][    T1] RDX: 0000000000000002 RSI: 00007ffcc6659740 RDI: 000000000000001b
[   14.262500][    T1] RBP: 00007ffcc6659740 R08: 0000000000000000 R09: 0000000000000001
[   14.264147][    T1] R10: 00007f956a6c4820 R11: 0000000000000202 R12: 0000000000000002
[   14.265934][    T1] R13: 000055fd63872c10 R14: 0000000000000002 R15: 00007f956a7219e0
[   14.267589][    T1]  </TASK>
[   14.268340][    T1] Modules linked in: ip_tables
[   14.269410][    T1] CR2: 0000000000000090
[   14.270478][    T1] ---[ end trace 0000000000000000 ]---
[   14.271717][    T1] RIP: 0010:si_meminfo_node (arch/x86/include/asm/atomic64_64.h:15 (discriminator 3) include/linux/atomic/atomic-arch-fallback.h:2583 (discriminator 3) include/linux/atomic/atomic-long.h:38 (discriminator 3) include/linux/atomic/atomic-instrumented.h:3189 (discriminator 3) include/linux/mmzone.h:1042 (discriminator 3) mm/show_mem.c:98 (discriminator 3))
[   14.272874][    T1] Code: 90 90 66 0f 1f 00 0f 1f 44 00 00 48 63 c6 55 31 d2 4c 8b 04 c5 c0 a7 fb 8c 53 48 89 c5 48 89 fb 4c 89 c0 49 8d b8 00 1e 00 00 <48> 8b 88 90 00 00 00 48 05 00 06 00 00 48 01 ca 48 39 f8 75 eb 48
All code
========
   0:  90                      nop
   1:  90                      nop
   2:  66 0f 1f 00             nopw   (%rax)
   6:  0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
   b:  48 63 c6                movslq %esi,%rax
   e:  55                      push   %rbp
   f:  31 d2                   xor    %edx,%edx
  11:  4c 8b 04 c5 c0 a7 fb    mov    -0x73045840(,%rax,8),%r8
  18:  8c
  19:  53                      push   %rbx
  1a:  48 89 c5                mov    %rax,%rbp
  1d:  48 89 fb                mov    %rdi,%rbx
  20:  4c 89 c0                mov    %r8,%rax
  23:  49 8d b8 00 1e 00 00    lea    0x1e00(%r8),%rdi
  2a:* 48 8b 88 90 00 00 00    mov    0x90(%rax),%rcx          <-- trapping instruction
  31:  48 05 00 06 00 00       add    $0x600,%rax
  37:  48 01 ca                add    %rcx,%rdx
  3a:  48 39 f8                cmp    %rdi,%rax
  3d:  75 eb                   jne    0x2a
  3f:  48                      rex.W

Code starting with the faulting instruction
===========================================
   0:  48 8b 88 90 00 00 00    mov    0x90(%rax),%rcx
   7:  48 05 00 06 00 00       add    $0x600,%rax
   d:  48 01 ca                add    %rcx,%rdx
  10:  48 39 f8                cmp    %rdi,%rax
  13:  75 eb                   jne    0x0
  15:  48                      rex.W

The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20240922/202409221625.1e974ac-oliver.sang@intel.com
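The backtrace lands in the si_meminfo_node(&si, 1) call this patch adds to memory_low_write(): on a machine with a single memory node, like the robot's qemu VM, NODE_DATA(1) is NULL, and si_meminfo_node()'s walk over pgdat->node_zones then reads through the NULL pgdat, which is consistent with the 0x90 offset in the trapping instruction and CR2 above. A minimal guard, sketched here on the assumption that node 1 is simply absent (it does not address the hardcoded-node design question raised in the review below):

	si_meminfo_node(&si, 0);
	local_capacity = si.totalram;	/* in pages */
	total_capacity = local_capacity;
	if (node_online(1)) {		/* skip a nonexistent remote node */
		si_meminfo_node(&si, 1);
		total_capacity += si.totalram;
	}
	locallow = low * local_capacity / total_capacity;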
On Fri, Sep 20, 2024 at 10:11:49PM +0000, kaiyang2@cs.cmu.edu wrote:
> From: Kaiyang Zhao <kaiyang2@cs.cmu.edu>
> 
> Add a memory.low for the top-tier node (locallow) and track its usage.
> locallow is set by scaling low by the ratio of node 0 capacity and
> node 0 + node 1 capacity.
> 
> Signed-off-by: Kaiyang Zhao <kaiyang2@cs.cmu.edu>
> ---
>  include/linux/page_counter.h | 16 ++++++++---
>  mm/hugetlb_cgroup.c          |  4 +--
>  mm/memcontrol.c              | 42 ++++++++++++++++++++++-------
>  mm/page_counter.c            | 52 ++++++++++++++++++++++++++++--------
>  4 files changed, 88 insertions(+), 26 deletions(-)
> 
> diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
> index 79dbd8bc35a7..aa56c93415ef 100644
> --- a/include/linux/page_counter.h
> +++ b/include/linux/page_counter.h
> @@ -13,6 +13,7 @@ struct page_counter {
>  	 * memcg->memory.usage is a hot member of struct mem_cgroup.
>  	 */
>  	atomic_long_t usage;
> +	struct mem_cgroup *memcg; /* memcg that owns this counter */

Can you make some comments on the lifetime of this new memcg reference?
How is it referenced, how is it cleaned up, etc.

Probably it's worth adding this in a separate patch so it's easier to
review the reference tracking.

>  	CACHELINE_PADDING(_pad1_);
> 
>  	/* effective memory.min and memory.min usage tracking */
> @@ -25,6 +26,10 @@ struct page_counter {
>  	atomic_long_t low_usage;
>  	atomic_long_t children_low_usage;
> 
> +	unsigned long elocallow;
> +	atomic_long_t locallow_usage;

per note on other email - probably want local_low_* instead of locallow.

> +	atomic_long_t children_locallow_usage;
> +
>  	unsigned long watermark;
>  	/* Latest cg2 reset watermark */
>  	unsigned long local_watermark;
> @@ -36,6 +41,7 @@ struct page_counter {
>  	bool protection_support;
>  	unsigned long min;
>  	unsigned long low;
> +	unsigned long locallow;
>  	unsigned long high;
>  	unsigned long max;
>  	struct page_counter *parent;
> @@ -52,12 +58,13 @@ struct page_counter {
>   */
>  static inline void page_counter_init(struct page_counter *counter,
>  				     struct page_counter *parent,
> -				     bool protection_support)
> +				     bool protection_support, struct mem_cgroup *memcg)
>  {
>  	counter->usage = (atomic_long_t)ATOMIC_LONG_INIT(0);
>  	counter->max = PAGE_COUNTER_MAX;
>  	counter->parent = parent;
>  	counter->protection_support = protection_support;
> +	counter->memcg = memcg;
>  }
> 
>  static inline unsigned long page_counter_read(struct page_counter *counter)
> @@ -72,7 +79,8 @@ bool page_counter_try_charge(struct page_counter *counter,
>  			     struct page_counter **fail);
>  void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
>  void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages);
> -void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages);
> +void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages,
> +		unsigned long nr_pages_local);
> 
>  static inline void page_counter_set_high(struct page_counter *counter,
>  					 unsigned long nr_pages)
> @@ -99,11 +107,11 @@ static inline void page_counter_reset_watermark(struct page_counter *counter)
>  #ifdef CONFIG_MEMCG
>  void page_counter_calculate_protection(struct page_counter *root,
>  				       struct page_counter *counter,
> -				       bool recursive_protection);
> +				       bool recursive_protection, int is_local);

`bool is_local` is preferred

>  #else
>  static inline void page_counter_calculate_protection(struct page_counter *root,
>  						      struct page_counter *counter,
> -						      bool recursive_protection) {}
> +						      bool recursive_protection, int is_local) {}
>  #endif
> 
>  #endif /* _LINUX_PAGE_COUNTER_H */
> diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
> index d8d0e665caed..0e07a7a1d5b8 100644
> --- a/mm/hugetlb_cgroup.c
> +++ b/mm/hugetlb_cgroup.c
> @@ -114,10 +114,10 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
>  	}
>  	page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup,
>  							     idx),
> -			  fault_parent, false);
> +			  fault_parent, false, NULL);
>  	page_counter_init(
>  		hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
> -		rsvd_parent, false);
> +		rsvd_parent, false, NULL);
> 
>  	limit = round_down(PAGE_COUNTER_MAX,
>  			   pages_per_huge_page(&hstates[idx]));
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 20b715441332..d7c5fff12105 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1497,6 +1497,9 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
>  			       vm_event_name(memcg_vm_event_stat[i]),
>  			       memcg_events(memcg, memcg_vm_event_stat[i]));
>  	}
> +
> +	seq_buf_printf(s, "local_usage %lu\n",
> +		       get_cgroup_local_usage(memcg, true));
>  }
> 
>  static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
> @@ -3597,8 +3600,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
>  	if (parent) {
>  		WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
> 
> -		page_counter_init(&memcg->memory, &parent->memory, true);
> -		page_counter_init(&memcg->swap, &parent->swap, false);
> +		page_counter_init(&memcg->memory, &parent->memory, true, memcg);
> +		page_counter_init(&memcg->swap, &parent->swap, false, NULL);
>  #ifdef CONFIG_MEMCG_V1
>  		WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
>  		page_counter_init(&memcg->kmem, &parent->kmem, false);
> @@ -3607,8 +3610,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
>  	} else {
>  		init_memcg_stats();
>  		init_memcg_events();
> -		page_counter_init(&memcg->memory, NULL, true);
> -		page_counter_init(&memcg->swap, NULL, false);
> +		page_counter_init(&memcg->memory, NULL, true, memcg);
> +		page_counter_init(&memcg->swap, NULL, false, NULL);
>  #ifdef CONFIG_MEMCG_V1
>  		page_counter_init(&memcg->kmem, NULL, false);
>  		page_counter_init(&memcg->tcpmem, NULL, false);
> @@ -3677,7 +3680,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
>  	memcg1_css_offline(memcg);
> 
>  	page_counter_set_min(&memcg->memory, 0);
> -	page_counter_set_low(&memcg->memory, 0);
> +	page_counter_set_low(&memcg->memory, 0, 0);
> 
>  	zswap_memcg_offline_cleanup(memcg);
> 
> @@ -3748,7 +3751,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
>  	page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
>  #endif
>  	page_counter_set_min(&memcg->memory, 0);
> -	page_counter_set_low(&memcg->memory, 0);
> +	page_counter_set_low(&memcg->memory, 0, 0);
>  	page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
>  	memcg1_soft_limit_reset(memcg);
>  	page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
> @@ -4051,6 +4054,12 @@ static ssize_t memory_min_write(struct kernfs_open_file *of,
>  	return nbytes;
>  }
> 
> +static int memory_locallow_show(struct seq_file *m, void *v)
> +{
> +	return seq_puts_memcg_tunable(m,
> +		READ_ONCE(mem_cgroup_from_seq(m)->memory.locallow));
> +}
> +
>  static int memory_low_show(struct seq_file *m, void *v)
>  {
>  	return seq_puts_memcg_tunable(m,
> @@ -4061,7 +4070,8 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
>  				char *buf, size_t nbytes, loff_t off)
>  {
>  	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
> -	unsigned long low;
> +	struct sysinfo si;
> +	unsigned long low, locallow, local_capacity, total_capacity;
>  	int err;
> 
>  	buf = strstrip(buf);
> @@ -4069,7 +4079,15 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
>  	if (err)
>  		return err;
> 
> -	page_counter_set_low(&memcg->memory, low);
> +	/* Hardcoded 0 for local node and 1 for remote. */

I know we've talked about this before, but this is obviously broken for
multi-socket systems. If so, this needs a FIXME or a TODO so that it's
at least obvious that this patch isn't ready for upstream - even as an
RFC.

Probably we can't move forward until we figure out how to solve this
problem ahead of this patch set. Worth discussing this issue
explicitly.

Maybe rather than guessing, a preferred node should be set for local
and remote if this mechanism is in use. Otherwise just guessing which
node is local and which is remote seems like it will be wrong -
especially for sufficiently large-threaded processes.

> +	si_meminfo_node(&si, 0);
> +	local_capacity = si.totalram;	/* In pages. */
> +	total_capacity = local_capacity;
> +	si_meminfo_node(&si, 1);
> +	total_capacity += si.totalram;
> +	locallow = low * local_capacity / total_capacity;
> +
> +	page_counter_set_low(&memcg->memory, low, locallow);
> 
>  	return nbytes;
>  }
> @@ -4394,6 +4412,11 @@ static struct cftype memory_files[] = {
>  		.seq_show = memory_low_show,
>  		.write = memory_low_write,
>  	},
> +	{
> +		.name = "locallow",
> +		.flags = CFTYPE_NOT_ON_ROOT,
> +		.seq_show = memory_locallow_show,
> +	},
>  	{
>  		.name = "high",
>  		.flags = CFTYPE_NOT_ON_ROOT,
> @@ -4483,7 +4506,8 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
>  	if (!root)
>  		root = root_mem_cgroup;
> 
> -	page_counter_calculate_protection(&root->memory, &memcg->memory, recursive_protection);
> +	page_counter_calculate_protection(&root->memory, &memcg->memory,
> +					  recursive_protection, false);
>  }
> 
>  static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
> diff --git a/mm/page_counter.c b/mm/page_counter.c
> index b249d15af9dd..97205aafab46 100644
> --- a/mm/page_counter.c
> +++ b/mm/page_counter.c
> @@ -18,8 +18,10 @@ static bool track_protection(struct page_counter *c)
>  	return c->protection_support;
>  }
> 
> +extern unsigned long get_cgroup_local_usage(struct mem_cgroup *memcg, bool flush);
> +
>  static void propagate_protected_usage(struct page_counter *c,
> -				      unsigned long usage)
> +				      unsigned long usage, unsigned long local_usage)
>  {
>  	unsigned long protected, old_protected;
>  	long delta;
> @@ -44,6 +46,15 @@ static void propagate_protected_usage(struct page_counter *c,
>  		if (delta)
>  			atomic_long_add(delta, &c->parent->children_low_usage);
>  	}
> +
> +	protected = min(local_usage, READ_ONCE(c->locallow));
> +	old_protected = atomic_long_read(&c->locallow_usage);
> +	if (protected != old_protected) {
> +		old_protected = atomic_long_xchg(&c->locallow_usage, protected);
> +		delta = protected - old_protected;
> +		if (delta)
> +			atomic_long_add(delta, &c->parent->children_locallow_usage);
> +	}
>  }
> 
>  /**
> @@ -63,7 +74,8 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
>  		atomic_long_set(&counter->usage, new);
>  	}
>  	if (track_protection(counter))
> -		propagate_protected_usage(counter, new);
> +		propagate_protected_usage(counter, new,
> +			get_cgroup_local_usage(counter->memcg, false));
>  }
> 
>  /**
> @@ -83,7 +95,8 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
> 
>  		new = atomic_long_add_return(nr_pages, &c->usage);
>  		if (protection)
> -			propagate_protected_usage(c, new);
> +			propagate_protected_usage(c, new,
> +				get_cgroup_local_usage(counter->memcg, false));
>  		/*
>  		 * This is indeed racy, but we can live with some
>  		 * inaccuracy in the watermark.
> @@ -151,7 +164,8 @@ bool page_counter_try_charge(struct page_counter *counter,
>  			goto failed;
>  		}
>  		if (protection)
> -			propagate_protected_usage(c, new);
> +			propagate_protected_usage(c, new,
> +				get_cgroup_local_usage(counter->memcg, false));
> 
>  		/* see comment on page_counter_charge */
>  		if (new > READ_ONCE(c->local_watermark)) {
> @@ -238,7 +252,8 @@ void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
>  	WRITE_ONCE(counter->min, nr_pages);
> 
>  	for (c = counter; c; c = c->parent)
> -		propagate_protected_usage(c, atomic_long_read(&c->usage));
> +		propagate_protected_usage(c, atomic_long_read(&c->usage),
> +			get_cgroup_local_usage(counter->memcg, false));
>  }
> 
>  /**
> @@ -248,14 +263,17 @@ void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
>   *
>   * The caller must serialize invocations on the same counter.
>   */
> -void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
> +void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages,
> +		unsigned long nr_pages_local)
> {
>  	struct page_counter *c;
> 
>  	WRITE_ONCE(counter->low, nr_pages);
> +	WRITE_ONCE(counter->locallow, nr_pages_local);
> 
>  	for (c = counter; c; c = c->parent)
> -		propagate_protected_usage(c, atomic_long_read(&c->usage));
> +		propagate_protected_usage(c, atomic_long_read(&c->usage),
> +			get_cgroup_local_usage(counter->memcg, false));
>  }
> 
>  /**
> @@ -421,9 +439,9 @@ static unsigned long effective_protection(unsigned long usage,
>   */
>  void page_counter_calculate_protection(struct page_counter *root,
>  				       struct page_counter *counter,
> -				       bool recursive_protection)
> +				       bool recursive_protection, int is_local)
> {
> -	unsigned long usage, parent_usage;
> +	unsigned long usage, parent_usage, local_usage, parent_local_usage;
>  	struct page_counter *parent = counter->parent;
> 
>  	/*
> @@ -437,16 +455,19 @@ void page_counter_calculate_protection(struct page_counter *root,
>  		return;
> 
>  	usage = page_counter_read(counter);
> -	if (!usage)
> +	local_usage = get_cgroup_local_usage(counter->memcg, true);
> +	if (!usage || !local_usage)
>  		return;
> 
>  	if (parent == root) {
>  		counter->emin = READ_ONCE(counter->min);
>  		counter->elow = READ_ONCE(counter->low);
> +		counter->elocallow = READ_ONCE(counter->locallow);
>  		return;
>  	}
> 
>  	parent_usage = page_counter_read(parent);
> +	parent_local_usage = get_cgroup_local_usage(parent->memcg, true);
> 
>  	WRITE_ONCE(counter->emin, effective_protection(usage, parent_usage,
>  			READ_ONCE(counter->min),
> @@ -454,7 +475,16 @@ void page_counter_calculate_protection(struct page_counter *root,
>  			atomic_long_read(&parent->children_min_usage),
>  			recursive_protection));
> 
> -	WRITE_ONCE(counter->elow, effective_protection(usage, parent_usage,
> +	if (is_local)
> +		WRITE_ONCE(counter->elocallow,
> +			effective_protection(local_usage, parent_local_usage,
> +			READ_ONCE(counter->locallow),
> +			READ_ONCE(parent->elocallow),
> +			atomic_long_read(&parent->children_locallow_usage),
> +			recursive_protection));
> +	else
> +		WRITE_ONCE(counter->elow,
> +			effective_protection(usage, parent_usage,
>  			READ_ONCE(counter->low),
>  			READ_ONCE(parent->elow),
>  			atomic_long_read(&parent->children_low_usage),
> -- 
> 2.43.0
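Following up the multi-socket concern raised in the review above: one possible direction, sketched here as an assumption rather than as part of the posted series, is to derive the top-tier capacity from the memory-tiering topology instead of hardcoding nodes 0 and 1, for example with node_is_toptier():

	int nid;
	unsigned long local_capacity = 0, total_capacity = 0;
	struct sysinfo si;

	/*
	 * Sketch: treat "local" as "top tier" in the mm/memory-tiers.c
	 * sense and sum capacities over all memory nodes, so the split
	 * holds regardless of how many sockets or CXL nodes exist.
	 */
	for_each_node_state(nid, N_MEMORY) {
		si_meminfo_node(&si, nid);
		total_capacity += si.totalram;
		if (node_is_toptier(nid))
			local_capacity += si.totalram;
	}
	locallow = low * local_capacity / total_capacity;

To make the scaling concrete with hypothetical capacities: on a machine with 16G of top-tier DRAM out of 64G total, writing 8G to memory.low would yield a locallow of 2G, a quarter of the requested protection.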
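On the reviewer's open question about the lifetime of counter->memcg: the only counter the patch passes a non-NULL memcg for is &memcg->memory, which is embedded in struct mem_cgroup, so the back-pointer could in principle be computed rather than stored. A hypothetical accessor along these lines (not part of the posted patch) would take no reference at all, since an embedded counter cannot outlive its memcg; counters that pass NULL today (swap, kmem, tcpmem, hugetlb) would still need their own handling:

	static inline struct mem_cgroup *
	page_counter_to_memcg(struct page_counter *counter)
	{
		/* Valid only for the counter embedded as memcg->memory;
		 * no refcount is taken, the counter dies with the memcg. */
		return container_of(counter, struct mem_cgroup, memory);
	}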