[bpf-next,v3,4/6] memcg: Use trylock to access memcg stock_lock.

Message ID 20241218030720.1602449-5-alexei.starovoitov@gmail.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series bpf, mm: Introduce try_alloc_pages()

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 1 this patch: 2
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers warning 4 maintainers not CCed: roman.gushchin@linux.dev cgroups@vger.kernel.org muchun.song@linux.dev mhocko@kernel.org
netdev/build_clang fail Errors and warnings before: 96 this patch: 68
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 84 this patch: 31
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 44 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc fail Errors and warnings before: 16 this patch: 17
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-18 success Logs for s390x-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-19 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-11 success Logs for aarch64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-45 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-46 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-gcc / veristat-kernel / x86_64-gcc veristat_kernel
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-gcc / veristat-meta / x86_64-gcc veristat_meta
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc

Commit Message

Alexei Starovoitov Dec. 18, 2024, 3:07 a.m. UTC
From: Alexei Starovoitov <ast@kernel.org>

Teach memcg to operate under trylock conditions when
spinning locks cannot be used.
The end result is __memcg_kmem_charge_page() and
__memcg_kmem_uncharge_page() are safe to use from
any context in RT and !RT.
In !RT the NMI handler may fail to trylock stock_lock.
In RT hard IRQ and NMI handlers will not attempt to trylock.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 mm/memcontrol.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

Comments

Michal Hocko Dec. 18, 2024, 11:32 a.m. UTC | #1
On Tue 17-12-24 19:07:17, alexei.starovoitov@gmail.com wrote:
> From: Alexei Starovoitov <ast@kernel.org>
> 
> Teach memcg to operate under trylock conditions when
> spinning locks cannot be used.

Can we make this trylock unconditional? I hope I am not really missing
anything important, but AFAIU the local_lock is just IRQ disabling on !RT.
For RT this is more involved but does it make sense to spin/sleep on the
cache if we can go ahead and charge directly to counters? I mean doesn't
this defeat the purpose of the cache in the first place?
Alexei Starovoitov Dec. 19, 2024, 1:53 a.m. UTC | #2
On Wed, Dec 18, 2024 at 3:32 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Tue 17-12-24 19:07:17, alexei.starovoitov@gmail.com wrote:
> > From: Alexei Starovoitov <ast@kernel.org>
> >
> > Teach memcg to operate under trylock conditions when
> > spinning locks cannot be used.
>
> Can we make this trylock unconditional? I hope I am not really missing
> anything important, but AFAIU the local_lock is just IRQ disabling on !RT.
> For RT this is more involved but does it make sense to spin/sleep on the
> cache if we can go ahead and charge directly to counters? I mean doesn't
> this defeat the purpose of the cache in the first place?

memcg folks please correct me.
My understanding is that memcg_stock is a batching mechanism.
Not really a cache. If we keep charging the counters directly
the performance will suffer due to atomic operations and hierarchy walk.
Hence I had to make sure consume_stock() is functioning as designed
and fallback when unlucky.
In !RT case the unlucky part can only happen in_nmi which should be
very rare.
In RT the unlucky part is in_hardirq or in_nmi, since spin_trylock
doesn't really work in those conditions as Steven explained.
Sebastian mentioned that he's working on removing preempt_disable()
from tracepoints. That should help bpf greatly too.
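
To make the batching point concrete, here is a heavily condensed sketch of the
two charge paths being discussed (simplified from try_charge_memcg() in
mm/memcontrol.c; memsw accounting, reclaim and error handling are omitted):

/*
 * Simplified sketch, not the real code: the per-cpu stock lets the common
 * case (the same memcg charging repeatedly on the same CPU) skip the atomic
 * page_counter updates that walk every ancestor in the hierarchy.
 */
static int try_charge_sketch(struct mem_cgroup *memcg, unsigned int nr_pages,
			     gfp_t gfp_mask)
{
	unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
	struct page_counter *counter;

	/* Fast path: consume from the per-cpu batch, no atomics at all. */
	if (consume_stock(memcg, nr_pages, gfp_mask))
		return 0;

	/* Slow path: one atomic try_charge per level of the hierarchy. */
	if (!page_counter_try_charge(&memcg->memory, batch, &counter))
		return -ENOMEM;

	/* Park the unused part of the batch in the per-cpu stock. */
	if (batch > nr_pages)
		refill_stock(memcg, batch - nr_pages);
	return 0;
}
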
Michal Hocko Dec. 19, 2024, 7:08 a.m. UTC | #3
On Wed 18-12-24 17:53:50, Alexei Starovoitov wrote:
> On Wed, Dec 18, 2024 at 3:32 AM Michal Hocko <mhocko@suse.com> wrote:
> >
> > On Tue 17-12-24 19:07:17, alexei.starovoitov@gmail.com wrote:
> > > From: Alexei Starovoitov <ast@kernel.org>
> > >
> > > Teach memcg to operate under trylock conditions when
> > > spinning locks cannot be used.
> >
> > Can we make this trylock unconditional? I hope I am not really missing
> > anything important, but AFAIU the local_lock is just IRQ disabling on !RT.
> > For RT this is more involved but does it make sense to spin/sleep on the
> > cache if we can go ahead and charge directly to counters? I mean doesn't
> > this defeat the purpose of the cache in the first place?
> 
> memcg folks please correct me.
> My understanding is that memcg_stock is a batching mechanism.

Yes, it is an optimization to avoid charging the page_counter directly
which involves atomic operations and that scales with the depth of the
hierarchy. So yes, it is a batching mechanism to optimize a common case
where the same memcg charges on the same cpu several allocations which
is a common case.

> Not really a cache. If we keep charging the counters directly
> the performance will suffer due to atomic operations and hierarchy walk.
> Hence I had to make sure consume_stock() is functioning as designed
> and fallback when unlucky.
> In !RT case the unlucky part can only happen in_nmi which should be
> very rare.

Right

> In RT the unlucky part is in_hardirq or in_nmi, since spin_trylock
> doesn't really work in those conditions as Steven explained.

Right

All that being said, the message I wanted to get through is that atomic
(NOWAIT) charges could be truly reentrant if the stock local lock uses
trylock. We do not need a dedicated gfp flag for that now.
Michal Hocko Dec. 19, 2024, 7:27 a.m. UTC | #4
On Thu 19-12-24 08:08:44, Michal Hocko wrote:
> All that being said, the message I wanted to get through is that atomic
> (NOWAIT) charges could be truly reentrant if the stock local lock uses
> trylock. We do not need a dedicated gfp flag for that now.

And I want to add. Not only we can achieve that, I also think this is
desirable because for !RT this will be no functional change and for RT
it makes more sense to simply do deterministic (albeit more costly
page_counter update) than spin over a lock to use the batch (or learn
the batch cannot be used).
Michal Hocko Dec. 19, 2024, 7:52 a.m. UTC | #5
On Thu 19-12-24 08:27:06, Michal Hocko wrote:
> On Thu 19-12-24 08:08:44, Michal Hocko wrote:
> > All that being said, the message I wanted to get through is that atomic
> > (NOWAIT) charges could be truly reentrant if the stock local lock uses
> > trylock. We do not need a dedicated gfp flag for that now.
> 
> And I want to add. Not only we can achieve that, I also think this is
> desirable because for !RT this will be no functional change and for RT
> it makes more sense to simply do deterministic (albeit more costly
> page_counter update) than spin over a lock to use the batch (or learn
> the batch cannot be used).

So effectively this on top of yours
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f168d223375f..29a831f6109c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1768,7 +1768,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
 		return ret;
 
 	if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
-		if (gfp_mask & __GFP_TRYLOCK)
+		if (!gfpflags_allow_blocking(gfp_mask))
 			return ret;
 		local_lock_irqsave(&memcg_stock.stock_lock, flags);
 	}
@@ -2211,6 +2211,9 @@ int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	if (consume_stock(memcg, nr_pages, gfp_mask))
 		return 0;
 
+	if (!gfpflags_allow_blocking(gfp_mask))
+		batch = nr_pages;
+
 	if (!do_memsw_account() ||
 	    page_counter_try_charge(&memcg->memsw, batch, &counter)) {
 		if (page_counter_try_charge(&memcg->memory, batch, &counter))
Alexei Starovoitov Dec. 20, 2024, 12:39 a.m. UTC | #6
On Wed, Dec 18, 2024 at 11:52 PM Michal Hocko <mhocko@suse.com> wrote:
>
> On Thu 19-12-24 08:27:06, Michal Hocko wrote:
> > On Thu 19-12-24 08:08:44, Michal Hocko wrote:
> > > All that being said, the message I wanted to get through is that atomic
> > > (NOWAIT) charges could be truly reentrant if the stock local lock uses
> > > trylock. We do not need a dedicated gfp flag for that now.
> >
> > And I want to add. Not only we can achieve that, I also think this is
> > desirable because for !RT this will be no functional change and for RT
> > it makes more sense to simply do deterministic (albeit more costly
> > page_counter update) than spin over a lock to use the batch (or learn
> > the batch cannot be used).
>
> So effectively this on top of yours
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index f168d223375f..29a831f6109c 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1768,7 +1768,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
>                 return ret;
>
>         if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
> -               if (gfp_mask & __GFP_TRYLOCK)
> +               if (!gfpflags_allow_blocking(gfp_mask))
>                         return ret;
>                 local_lock_irqsave(&memcg_stock.stock_lock, flags);

I don't quite understand such a strong desire to avoid the new GFP flag
especially when it's in mm/internal.h. There are lots of bits left.
It's not like PF_* flags that are limited, but fine
let's try to avoid GFP_TRYLOCK_BIT.

You're correct that in !RT the above will work, but in RT
spin_trylock vs spin_lock might cause spurious direct page_counter
charge instead of batching. It's still correct and unlikely to
cause performance issues, so probably fine, but in other
places like slub.c gfpflags_allow_blocking() is too coarse.
All of GFP_NOWAIT will fall into such 'trylock' category,
more slub bits will be trylock-ing and potentially returning ENOMEM
for existing GFP_NOWAIT users which is not great.

I think we can do better, though it's a bit odd to indicate
trylock gfp mode by _absence_ of flags instead of presence
of __GFP_TRYLOCK bit.

How about the following:

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index ff9060af6295..f06131d5234f 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -39,6 +39,17 @@ static inline bool gfpflags_allow_blocking(const
gfp_t gfp_flags)
        return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
 }

+static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags)
+{
+       /*
+        * !__GFP_DIRECT_RECLAIM -> direct reclaim is not allowed.
+        * !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd.
+        * All GFP_* flags including GFP_NOWAIT use one or both flags.
+        * try_alloc_pages() is the only API that doesn't specify either flag.
+        */
+       return !!(gfp_flags & __GFP_RECLAIM);
+}
+
 #ifdef CONFIG_HIGHMEM
 #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
 #else
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f168d223375f..545d345c22de 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1768,7 +1768,7 @@ static bool consume_stock(struct mem_cgroup
*memcg, unsigned int nr_pages,
                return ret;

        if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
-               if (gfp_mask & __GFP_TRYLOCK)
+               if (!gfpflags_allow_spinning(gfp_mask))
                        return ret;
                local_lock_irqsave(&memcg_stock.stock_lock, flags);
        }

If that's acceptable then such an approach will work for
my slub.c reentrance changes too.
GFP_NOWAIT users will not be affected.
The slub's trylock mode will be only for my upcoming try_kmalloc()
that won't use either gfp_*_reclaim flag.
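
For illustration, a few examples of how the proposed helper classifies common
masks (this assumes the helper above; the exact gfp mask try_alloc_pages()
ends up using is only a guess here):

/*
 * Illustration only: every stock GFP_* mask carries at least one reclaim
 * bit, so only a mask that sets neither bit (as a try_alloc_pages()-style
 * caller would) reports that spinning on locks is not allowed.
 */
static void gfp_spinning_examples(void)
{
	/* __GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM -> may spin (and sleep) */
	WARN_ON(!gfpflags_allow_spinning(GFP_KERNEL));

	/* __GFP_KSWAPD_RECLAIM set (no direct reclaim) -> may spin, never sleep */
	WARN_ON(!gfpflags_allow_spinning(GFP_NOWAIT));
	WARN_ON(!gfpflags_allow_spinning(GFP_ATOMIC));

	/* no reclaim bits at all -> trylock-only, e.g. a try_alloc_pages() mask */
	WARN_ON(gfpflags_allow_spinning(__GFP_NOWARN | __GFP_ACCOUNT));
}
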
Michal Hocko Dec. 20, 2024, 8:24 a.m. UTC | #7
On Thu 19-12-24 16:39:43, Alexei Starovoitov wrote:
> On Wed, Dec 18, 2024 at 11:52 PM Michal Hocko <mhocko@suse.com> wrote:
> >
> > On Thu 19-12-24 08:27:06, Michal Hocko wrote:
> > > On Thu 19-12-24 08:08:44, Michal Hocko wrote:
> > > > All that being said, the message I wanted to get through is that atomic
> > > > (NOWAIT) charges could be truly reentrant if the stock local lock uses
> > > > trylock. We do not need a dedicated gfp flag for that now.
> > >
> > > And I want to add. Not only we can achieve that, I also think this is
> > > desirable because for !RT this will be no functional change and for RT
> > > it makes more sense to simply do deterministic (albeit more costly
> > > page_counter update) than spin over a lock to use the batch (or learn
> > > the batch cannot be used).
> >
> > So effectively this on top of yours
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index f168d223375f..29a831f6109c 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -1768,7 +1768,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
> >                 return ret;
> >
> >         if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
> > -               if (gfp_mask & __GFP_TRYLOCK)
> > +               if (!gfpflags_allow_blocking(gfp_mask))
> >                         return ret;
> >                 local_lock_irqsave(&memcg_stock.stock_lock, flags);
> 
> I don't quite understand such a strong desire to avoid the new GFP flag
> especially when it's in mm/internal.h. There are lots of bits left.
> It's not like PF_* flags that are limited, but fine
> let's try to avoid GFP_TRYLOCK_BIT.

Because historically this has proven to be a bad idea that usually
backfires.  As I've said in another email, I do care much less now that
this is mostly internal (one can still do that but would need to try
hard). But still if we _can_ avoid it and it makes the code generally
_sensible_ then let's not introduce a new flag.

[...]
> How about the following:
> 
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index ff9060af6295..f06131d5234f 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -39,6 +39,17 @@ static inline bool gfpflags_allow_blocking(const
> gfp_t gfp_flags)
>         return !!(gfp_flags & __GFP_DIRECT_RECLAIM);
>  }
> 
> +static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags)
> +{
> +       /*
> +        * !__GFP_DIRECT_RECLAIM -> direct reclaim is not allowed.
> +        * !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd.
> +        * All GFP_* flags including GFP_NOWAIT use one or both flags.
> +        * try_alloc_pages() is the only API that doesn't specify either flag.

I wouldn't be surprised if we had other allocations like that. git grep
is generally not very helpful as many/most allocations use gfp argument
of a sort. I would slightly reword this to be more explicit.
	  /*
	   * This is stronger than GFP_NOWAIT or GFP_ATOMIC because
	   * those are guaranteed to never block on a sleeping lock.
	   * Here we are enforcing that the allocation doesn't ever spin
	   * on any locks (i.e. only trylocks). There is no highlevel
	   * GFP_$FOO flag for this, use try_alloc_pages as the
	   * regular page allocator doesn't fully support this
	   * allocation mode.
> +        */
> +       return !!(gfp_flags & __GFP_RECLAIM);
> +}
> +
>  #ifdef CONFIG_HIGHMEM
>  #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
>  #else
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index f168d223375f..545d345c22de 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1768,7 +1768,7 @@ static bool consume_stock(struct mem_cgroup
> *memcg, unsigned int nr_pages,
>                 return ret;
> 
>         if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
> -               if (gfp_mask & __GFP_TRYLOCK)
> +               if (!gfpflags_allow_spinning(gfp_mask))
>                         return ret;
>                 local_lock_irqsave(&memcg_stock.stock_lock, flags);
>         }
> 
> If that's acceptable then such an approach will work for
> my slub.c reentrance changes too.

It certainly is acceptable for me. Do not forget to add another hunk to
avoid charging the full batch in this case.

Thanks!
Alexei Starovoitov Dec. 20, 2024, 4:10 p.m. UTC | #8
On Fri, Dec 20, 2024 at 12:24 AM Michal Hocko <mhocko@suse.com> wrote:
>
> > +static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags)
> > +{
> > +       /*
> > +        * !__GFP_DIRECT_RECLAIM -> direct reclaim is not allowed.
> > +        * !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd.
> > +        * All GFP_* flags including GFP_NOWAIT use one or both flags.
> > +        * try_alloc_pages() is the only API that doesn't specify either flag.
>
> I wouldn't be surprised if we had other allocations like that. git grep
> is generally not very helpful as many/most allocations use gfp argument
> of a sort. I would slightly reword this to be more explicit.
>           /*
>            * This is stronger than GFP_NOWAIT or GFP_ATOMIC because
>            * those are guaranteed to never block on a sleeping lock.
>            * Here we are enforcing that the allaaction doesn't ever spin
>            * on any locks (i.e. only trylocks). There is no highlevel
>            * GFP_$FOO flag for this use try_alloc_pages as the
>            * regular page allocator doesn't fully support this
>            * allocation mode.

Makes sense. I like this new wording. Will incorporate.

> > +        */
> > +       return !!(gfp_flags & __GFP_RECLAIM);
> > +}
> > +
> >  #ifdef CONFIG_HIGHMEM
> >  #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
> >  #else
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index f168d223375f..545d345c22de 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -1768,7 +1768,7 @@ static bool consume_stock(struct mem_cgroup
> > *memcg, unsigned int nr_pages,
> >                 return ret;
> >
> >         if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
> > -               if (gfp_mask & __GFP_TRYLOCK)
> > +               if (!gfpflags_allow_spinning(gfp_mask))
> >                         return ret;
> >                 local_lock_irqsave(&memcg_stock.stock_lock, flags);
> >         }
> >
> > If that's acceptable then such an approach will work for
> > my slub.c reentrance changes too.
>
> It certainly is acceptable for me.

Great.

> Do not forget to add another hunk to
> avoid charging the full batch in this case.

Well. It looks like you spotted the existing bug ?

Instead of
+       if (!gfpflags_allow_blocking(gfp_mask))
+               batch = nr_pages;

it should be unconditional:

+               batch = nr_pages;

after consume_stock() returns false.

Consider:
        stock_pages = READ_ONCE(stock->nr_pages);
        if (memcg == READ_ONCE(stock->cached) && stock_pages >= nr_pages) {

stock_pages == 10
nr_pages == 20

so after consume_stock() returns false
the batch will stay == MEMCG_CHARGE_BATCH == 64
and
page_counter_try_charge(&memcg->memsw, batch,...

will charge too much ?

and the bug was there for a long time.

Johannes,

looks like it's mostly your code?

Pls help us out.
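
For reference, the path in question condenses to roughly the following
(a sketch with the numbers from above, not the real code):

/* Sketch of the scenario above: 10 pages in the stock, 20 requested. */
static void overcharge_scenario(struct mem_cgroup *memcg)
{
	unsigned int nr_pages = 20;
	unsigned int stock_pages = 10;	/* READ_ONCE(stock->nr_pages) */
	unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);	/* stays 64 */
	struct page_counter *counter;

	if (stock_pages < nr_pages) {
		/*
		 * consume_stock() returned false, so try_charge_memcg()
		 * charges the full 64-page batch for a 20-page request.
		 */
		page_counter_try_charge(&memcg->memsw, batch, &counter);
		page_counter_try_charge(&memcg->memory, batch, &counter);
	}
}
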
Shakeel Butt Dec. 20, 2024, 7:45 p.m. UTC | #9
On Fri, Dec 20, 2024 at 08:10:47AM -0800, Alexei Starovoitov wrote:
> On Fri, Dec 20, 2024 at 12:24 AM Michal Hocko <mhocko@suse.com> wrote:
> >
> > > +static inline bool gfpflags_allow_spinning(const gfp_t gfp_flags)
> > > +{
> > > +       /*
> > > +        * !__GFP_DIRECT_RECLAIM -> direct reclaim is not allowed.
> > > +        * !__GFP_KSWAPD_RECLAIM -> it's not safe to wake up kswapd.
> > > +        * All GFP_* flags including GFP_NOWAIT use one or both flags.
> > > +        * try_alloc_pages() is the only API that doesn't specify either flag.
> >
> > I wouldn't be surprised if we had other allocations like that. git grep
> > is generally not very helpful as many/most allocations use gfp argument
> > of a sort. I would slightly reword this to be more explicit.
> >           /*
> >            * This is stronger than GFP_NOWAIT or GFP_ATOMIC because
> >            * those are guaranteed to never block on a sleeping lock.
> >            * Here we are enforcing that the allaaction doesn't ever spin
> >            * on any locks (i.e. only trylocks). There is no highlevel
> >            * GFP_$FOO flag for this use try_alloc_pages as the
> >            * regular page allocator doesn't fully support this
> >            * allocation mode.
> 
> Makes sense. I like this new wording. Will incorporate.
> 
> > > +        */
> > > +       return !!(gfp_flags & __GFP_RECLAIM);
> > > +}
> > > +
> > >  #ifdef CONFIG_HIGHMEM
> > >  #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM
> > >  #else
> > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > index f168d223375f..545d345c22de 100644
> > > --- a/mm/memcontrol.c
> > > +++ b/mm/memcontrol.c
> > > @@ -1768,7 +1768,7 @@ static bool consume_stock(struct mem_cgroup
> > > *memcg, unsigned int nr_pages,
> > >                 return ret;
> > >
> > >         if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
> > > -               if (gfp_mask & __GFP_TRYLOCK)
> > > +               if (!gfpflags_allow_spinning(gfp_mask))
> > >                         return ret;
> > >                 local_lock_irqsave(&memcg_stock.stock_lock, flags);
> > >         }
> > >
> > > If that's acceptable then such an approach will work for
> > > my slub.c reentrance changes too.
> >
> > It certainly is acceptable for me.
> 
> Great.
> 
> > Do not forget to add another hunk to
> > avoid charging the full batch in this case.
> 
> Well. It looks like you spotted the existing bug ?
> 
> Instead of
> +       if (!gfpflags_allow_blocking(gfp_mask))
> +               batch = nr_pages;
> 
> it should be unconditional:
> 
> +               batch = nr_pages;
> 
> after consume_stock() returns false.
> 
> Consider:
>         stock_pages = READ_ONCE(stock->nr_pages);
>         if (memcg == READ_ONCE(stock->cached) && stock_pages >= nr_pages) {
> 
> stock_pages == 10
> nr_pages == 20
> 
> so after consume_stock() returns false
> the batch will stay == MEMCG_CHARGE_BATCH == 64
> and
> page_counter_try_charge(&memcg->memsw, batch,...
> 
> will charge too much ?
> 
> and the bug was there for a long time.
> 
> Johaness,
> 
> looks like it's mostly your code?
> 
> Pls help us out.

I think the code is fine as the overcharge amount will be refilled into
the stock (old one will be flushed). 

	if (!gfpflags_allow_spinning(gfp_mask))
		batch = nr_pages;

The above code will just avoid the refill and flushing the older stock.
Maybe Michal's suggestion is due to that reason.

BTW after the done_restock tag in try_charge_memcg(), we will need another
gfpflags_allow_spinning() check to avoid schedule_work() and
mem_cgroup_handle_over_high(). Maybe simply return early for
gfpflags_allow_spinning() without checking high marks.
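
A minimal sketch of that suggestion (hypothetical, not part of the posted
patch; it assumes the gfpflags_allow_spinning() helper discussed above and
shows only the tail of try_charge_memcg() after the done_restock label):

done_restock:
	if (batch > nr_pages)
		refill_stock(memcg, batch - nr_pages);

	/*
	 * Trylock-only (non-spinning) callers would bail out here: no
	 * memory.high checks, no schedule_work(), no
	 * mem_cgroup_handle_over_high().
	 */
	if (!gfpflags_allow_spinning(gfp_mask))
		return 0;

	/* ... existing memory.high handling continues for everyone else ... */
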
Michal Hocko Dec. 21, 2024, 7:20 a.m. UTC | #10
On Fri 20-12-24 11:45:47, Shakeel Butt wrote:
[...]
> I think the code is fine as the overcharge amount will be refilled into
> the stock (old one will be flushed). 
> 
> 	if (!gfpflags_allow_spinning(gfp_mask))
> 		batch = nr_pages;
> 
> The above code will just avoid the refill and flushing the older stock.
> Maybe Michal's suggestion is due to that reason.

Exactly. This surely begs for a comment to explain that.

> BTW after the done_restock tag in try_charge_memcg(), we will need another
> gfpflags_allow_spinning() check to avoid schedule_work() and
> mem_cgroup_handle_over_high(). Maybe simply return early for
> gfpflags_allow_spinning() without checking high marks.

Right you are.

Btw. I will be mostly offline until Jan 6.

Patch

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7b3503d12aaf..f168d223375f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1756,7 +1756,8 @@  static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
  *
  * returns true if successful, false otherwise.
  */
-static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
+			  gfp_t gfp_mask)
 {
 	struct memcg_stock_pcp *stock;
 	unsigned int stock_pages;
@@ -1766,7 +1767,11 @@  static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 	if (nr_pages > MEMCG_CHARGE_BATCH)
 		return ret;
 
-	local_lock_irqsave(&memcg_stock.stock_lock, flags);
+	if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
+		if (gfp_mask & __GFP_TRYLOCK)
+			return ret;
+		local_lock_irqsave(&memcg_stock.stock_lock, flags);
+	}
 
 	stock = this_cpu_ptr(&memcg_stock);
 	stock_pages = READ_ONCE(stock->nr_pages);
@@ -1851,7 +1856,14 @@  static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
 	unsigned long flags;
 
-	local_lock_irqsave(&memcg_stock.stock_lock, flags);
+	if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
+		/*
+		 * In case of unlikely failure to lock percpu stock_lock
+		 * uncharge memcg directly.
+		 */
+		mem_cgroup_cancel_charge(memcg, nr_pages);
+		return;
+	}
 	__refill_stock(memcg, nr_pages);
 	local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
 }
@@ -2196,7 +2208,7 @@  int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	unsigned long pflags;
 
 retry:
-	if (consume_stock(memcg, nr_pages))
+	if (consume_stock(memcg, nr_pages, gfp_mask))
 		return 0;
 
 	if (!do_memsw_account() ||