diff mbox series

[RFC,1/4] fs/locks: Fix file lock cache accounting, again

Message ID ac84a832feba5418e1b58d1c7f3fe6cc7bc1de58.1705507931.git.jpoimboe@kernel.org (mailing list archive)
State New
Headers show
Series Fix file lock cache accounting, again | expand

Commit Message

Josh Poimboeuf Jan. 17, 2024, 4:14 p.m. UTC
A container can exceed its memcg limits by allocating a bunch of file
locks.

This bug was originally fixed by commit 0f12156dff28 ("memcg: enable
accounting for file lock caches"), but was later reverted by commit
3754707bcc3e ("Revert "memcg: enable accounting for file lock caches"")
due to performance issues.

Unfortunately those performance issues were never addressed and the bug
has remained unfixed for over two years.

Fix it by default but allow users to disable it with a cmdline option
(flock_accounting=off).

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 .../admin-guide/kernel-parameters.txt         | 17 +++++++++++
 fs/locks.c                                    | 30 +++++++++++++++++--
 2 files changed, 45 insertions(+), 2 deletions(-)

Comments

Jeff Layton Jan. 17, 2024, 7 p.m. UTC | #1
On Wed, 2024-01-17 at 08:14 -0800, Josh Poimboeuf wrote:
> A container can exceed its memcg limits by allocating a bunch of file
> locks.
> 
> This bug was originally fixed by commit 0f12156dff28 ("memcg: enable
> accounting for file lock caches"), but was later reverted by commit
> 3754707bcc3e ("Revert "memcg: enable accounting for file lock caches"")
> due to performance issues.
> 
> Unfortunately those performance issues were never addressed and the bug
> has remained unfixed for over two years.
> 
> Fix it by default but allow users to disable it with a cmdline option
> (flock_accounting=off).
> 
> Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
> ---
>  .../admin-guide/kernel-parameters.txt         | 17 +++++++++++
>  fs/locks.c                                    | 30 +++++++++++++++++--
>  2 files changed, 45 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 6ee0f9a5da70..91987b06bc52 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -1527,6 +1527,23 @@
>  			See Documentation/admin-guide/sysctl/net.rst for
>  			fb_tunnels_only_for_init_ns
>  
> +	flock_accounting=
> +			[KNL] Enable/disable accounting for kernel
> +			memory allocations related to file locks.
> +			Format: { on | off }
> +			Default: on
> +			on:	Enable kernel memory accounting for file
> +				locks.  This prevents task groups from
> +				exceeding their memcg allocation limits.
> +				However, it may cause slowdowns in the
> +				flock() system call.
> +			off:	Disable kernel memory accounting for
> +				file locks.  This may allow a rogue task
> +				to DoS the system by forcing the kernel
> +				to allocate memory beyond the task
> +				group's memcg limits.  Not recommended
> +				unless you have trusted user space.
> +
>  	floppy=		[HW]
>  			See Documentation/admin-guide/blockdev/floppy.rst.
>  
> diff --git a/fs/locks.c b/fs/locks.c
> index cc7c117ee192..235ac56c557d 100644
> --- a/fs/locks.c
> +++ b/fs/locks.c
> @@ -2905,15 +2905,41 @@ static int __init proc_locks_init(void)
>  fs_initcall(proc_locks_init);
>  #endif
>  
> +static bool flock_accounting __ro_after_init = true;
> +
> +static int __init flock_accounting_cmdline(char *str)
> +{
> +	if (!str)
> +		return -EINVAL;
> +
> +	if (!strcmp(str, "off"))
> +		flock_accounting = false;
> +	else if (!strcmp(str, "on"))
> +		flock_accounting = true;
> +	else
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +early_param("flock_accounting", flock_accounting_cmdline);
> +
> +#define FLOCK_ACCOUNTING_MSG "WARNING: File lock accounting is disabled, container-triggered host memory exhaustion possible!\n"
> +
>  static int __init filelock_init(void)
>  {
>  	int i;
> +	slab_flags_t flags = SLAB_PANIC;
> +
> +	if (!flock_accounting)
> +		pr_err(FLOCK_ACCOUNTING_MSG);
> +	else
> +		flags |= SLAB_ACCOUNT;
>  
>  	flctx_cache = kmem_cache_create("file_lock_ctx",
> -			sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL);
> +			sizeof(struct file_lock_context), 0, flags, NULL);
>  
>  	filelock_cache = kmem_cache_create("file_lock_cache",
> -			sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
> +			sizeof(struct file_lock), 0, flags, NULL);
>  
>  	for_each_possible_cpu(i) {
>  		struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i);

I'm really not a fan of tunables or different kconfig options,
especially for something niche like this.

I also question whether this accounting will show up under any real-
world workloads, and whether it was just wrong to revert those patches
back in 2021.

File locking is an activity where we inherently expect to block. Ideally
we don't if the lock is uncontended of course, but it's always a
possibility.

The benchmark that prompted the revert basically just tries to
create and release a bunch of file locks as quickly as possible.
Legitimate applications that do a lot of very rapid locking like this
benchmark are basically non-existent. Usually the pattern is:

    acquire lock
    do some (relatively slow) I/O
    release lock

In that sort of scenario, is this memcg accounting more than just line
noise? I wonder whether we should just bite the bullet and see whether
there are any real workloads that suffer due to SLAB_ACCOUNT being
enabled on these caches?
Josh Poimboeuf Jan. 17, 2024, 7:39 p.m. UTC | #2
On Wed, Jan 17, 2024 at 02:00:55PM -0500, Jeff Layton wrote:
> I'm really not a fan of tunables or different kconfig options,
> especially for something niche like this.
> 
> I also question whether this accounting will show up under any real-
> world workloads, and whether it was just wrong to revert those patches
> back in 2021.
> 
> File locking is an activity where we inherently expect to block. Ideally
> we don't if the lock is uncontended of course, but it's always a
> possibility.
> 
> The benchmark that prompted the revert basically just tries to
> create and release a bunch of file locks as quickly as possible.
> Legitimate applications that do a lot of very rapid locking like this
> benchmark are basically non-existent. Usually the pattern is:
> 
>     acquire lock
>     do some (relatively slow) I/O
>     release lock
> 
> In that sort of scenario, is this memcg accounting more than just line
> noise? I wonder whether we should just bite the bullet and see whether
> there are any real workloads that suffer due to SLAB_ACCOUNT being
> enabled on these caches?

That's a good point.  If the microbenchmark isn't likely to be even
remotely realistic, maybe we should just revert the revert until if/when
somebody shows a real world impact.

Linus, any objections to that?
Linus Torvalds Jan. 17, 2024, 8:20 p.m. UTC | #3
On Wed, 17 Jan 2024 at 11:39, Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>
> That's a good point.  If the microbenchmark isn't likely to be even
> remotely realistic, maybe we should just revert the revert until if/when
> somebody shows a real world impact.
>
> Linus, any objections to that?

We use SLAB_ACCOUNT for much more common allocations like queued
signals, so I would tend to agree with Jeff that it's probably just
some not very interesting microbenchmark that shows any file locking
effects from SLAB_ACCOUNT, not any real use.

That said, those benchmarks do matter. It's very easy to say "not
relevant in the big picture" and then the end result is that
everything is a bit of a pig.

And the regression was absolutely *ENORMOUS*. We're not talking "a few
percent". We're talking a 33% regression that caused the revert:

   https://lore.kernel.org/lkml/20210907150757.GE17617@xsang-OptiPlex-9020/

I wish our SLAB_ACCOUNT wasn't such a pig. Rather than account every
single allocation, it would be much nicer to account at a bigger
granularity, possibly by having per-thread counters first before
falling back to the obj_cgroup_charge. Whatever.

It's kind of stupid to have a benchmark that just allocates and
deallocates a file lock in quick succession spend lots of time
incrementing and decrementing cgroup charges for that repeated
alloc/free.

However, that problem with SLAB_ACCOUNT is not the fault of file
locking, but more of a slab issue.

End result: I think we should bring in Vlastimil and whoever else is
doing SLAB_ACCOUNT things, and have them look at that side.

And then just enable SLAB_ACCOUNT for file locks. But very much look
at silly costs in SLAB_ACCOUNT first, at least for trivial
"alloc/free" patterns..

Vlastimil? Who would be the best person to look at that SLAB_ACCOUNT
thing? See commit 3754707bcc3e (Revert "memcg: enable accounting for
file lock caches") for the history here.

                 Linus
Shakeel Butt Jan. 17, 2024, 9:02 p.m. UTC | #4
On Wed, Jan 17, 2024 at 12:21 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> On Wed, 17 Jan 2024 at 11:39, Josh Poimboeuf <jpoimboe@kernel.org> wrote:
> >
> > That's a good point.  If the microbenchmark isn't likely to be even
> > remotely realistic, maybe we should just revert the revert until if/when
> > somebody shows a real world impact.
> >
> > Linus, any objections to that?
>
> We use SLAB_ACCOUNT for much more common allocations like queued
> signals, so I would tend to agree with Jeff that it's probably just
> some not very interesting microbenchmark that shows any file locking
> effects from SLAB_ACCOUNT, not any real use.
>
> That said, those benchmarks do matter. It's very easy to say "not
> relevant in the big picture" and then the end result is that
> everything is a bit of a pig.
>
> And the regression was absolutely *ENORMOUS*. We're not talking "a few
> percent". We're talking a 33% regression that caused the revert:
>
>    https://lore.kernel.org/lkml/20210907150757.GE17617@xsang-OptiPlex-9020/
>
> I wish our SLAB_ACCOUNT wasn't such a pig. Rather than account every
> single allocation, it would be much nicer to account at a bigger
> granularity, possibly by having per-thread counters first before
> falling back to the obj_cgroup_charge. Whatever.
>
> It's kind of stupid to have a benchmark that just allocates and
> deallocates a file lock in quick succession spend lots of time
> incrementing and decrementing cgroup charges for that repeated
> alloc/free.
>
> However, that problem with SLAB_ACCOUNT is not the fault of file
> locking, but more of a slab issue.
>
> End result: I think we should bring in Vlastimil and whoever else is
> doing SLAB_ACCOUNT things, and have them look at that side.
>
> And then just enable SLAB_ACCOUNT for file locks. But very much look
> at silly costs in SLAB_ACCOUNT first, at least for trivial
> "alloc/free" patterns..
>
> Vlastimil? Who would be the best person to look at that SLAB_ACCOUNT
> thing? See commit 3754707bcc3e (Revert "memcg: enable accounting for
> file lock caches") for the history here.
>

Roman last looked into optimizing this code path. I suspect
mod_objcg_state() to be more costly than obj_cgroup_charge(). I will
try to measure this path and see if I can improve it.
Vlastimil Babka Jan. 17, 2024, 9:19 p.m. UTC | #5
On 1/17/24 21:20, Linus Torvalds wrote:
> On Wed, 17 Jan 2024 at 11:39, Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>>
>> That's a good point.  If the microbenchmark isn't likely to be even
>> remotely realistic, maybe we should just revert the revert until if/when
>> somebody shows a real world impact.
>>
>> Linus, any objections to that?
> 
> We use SLAB_ACCOUNT for much more common allocations like queued
> signals, so I would tend to agree with Jeff that it's probably just
> some not very interesting microbenchmark that shows any file locking
> effects from SLAB_ACCOUNT, not any real use.
> 
> That said, those benchmarks do matter. It's very easy to say "not
> relevant in the big picture" and then the end result is that
> everything is a bit of a pig.
> 
> And the regression was absolutely *ENORMOUS*. We're not talking "a few
> percent". We're talking a 33% regression that caused the revert:
> 
>    https://lore.kernel.org/lkml/20210907150757.GE17617@xsang-OptiPlex-9020/
> 
> I wish our SLAB_ACCOUNT wasn't such a pig. Rather than account every
> single allocation, it would be much nicer to account at a bigger
> granularity, possibly by having per-thread counters first before
> falling back to the obj_cgroup_charge. Whatever.

Counters are one thing (afaik some batching happens on the memcg side via
"stocks"), but another is associating the memcg with the allocated objects
in slab pages, so kmem_cache_free() knows which counter to decrement. We'll
have to see where the overhead is today.

If there's overhead due to calls between mm/slub.c and mm/memcontrol.c we
can now reduce that with SLAB gone.

> It's kind of stupid to have a benchmark that just allocates and
> deallocates a file lock in quick succession spend lots of time
> incrementing and decrementing cgroup charges for that repeated
> alloc/free.
> 
> However, that problem with SLAB_ACCOUNT is not the fault of file
> locking, but more of a slab issue.
> 
> End result: I think we should bring in Vlastimil and whoever else is
> doing SLAB_ACCOUNT things, and have them look at that side.

Roman and Shakeel are already Cc'd. Roman recently did
https://lore.kernel.org/lkml/20231019225346.1822282-1-roman.gushchin@linux.dev/
which is mentioned in the cover letter and was merged in 6.7, but the cover
letter says it didn't help much, too bad. So is it still 33%, or how much is
it now?

> And then just enable SLAB_ACCOUNT for file locks. But very much look
> at silly costs in SLAB_ACCOUNT first, at least for trivial
> "alloc/free" patterns..
> 
> Vlastimil? Who would be the best person to look at that SLAB_ACCOUNT
> thing? See commit 3754707bcc3e (Revert "memcg: enable accounting for
> file lock caches") for the history here.
> 
>                  Linus
Roman Gushchin Jan. 17, 2024, 9:50 p.m. UTC | #6
On Wed, Jan 17, 2024 at 12:20:59PM -0800, Linus Torvalds wrote:
> On Wed, 17 Jan 2024 at 11:39, Josh Poimboeuf <jpoimboe@kernel.org> wrote:
> >
> > That's a good point.  If the microbenchmark isn't likely to be even
> > remotely realistic, maybe we should just revert the revert until if/when
> > somebody shows a real world impact.
> >
> > Linus, any objections to that?
> 
> We use SLAB_ACCOUNT for much more common allocations like queued
> signals, so I would tend to agree with Jeff that it's probably just
> some not very interesting microbenchmark that shows any file locking
> effects from SLAB_ACCOUNT, not any real use.
> 
> That said, those benchmarks do matter. It's very easy to say "not
> relevant in the big picture" and then the end result is that
> everything is a bit of a pig.
> 
> And the regression was absolutely *ENORMOUS*. We're not talking "a few
> percent". We're talking a 33% regression that caused the revert:
> 
>    https://lore.kernel.org/lkml/20210907150757.GE17617@xsang-OptiPlex-9020/
> 
> I wish our SLAB_ACCOUNT wasn't such a pig. Rather than account every
> single allocation, it would be much nicer to account at a bigger
> granularity, possibly by having per-thread counters first before
> falling back to the obj_cgroup_charge. Whatever.
> 
> It's kind of stupid to have a benchmark that just allocates and
> deallocates a file lock in quick succession spend lots of time
> incrementing and decrementing cgroup charges for that repeated
> alloc/free.
> 
> However, that problem with SLAB_ACCOUNT is not the fault of file
> locking, but more of a slab issue.
> 
> End result: I think we should bring in Vlastimil and whoever else is
> doing SLAB_ACCOUNT things, and have them look at that side.
> 
> And then just enable SLAB_ACCOUNT for file locks. But very much look
> at silly costs in SLAB_ACCOUNT first, at least for trivial
> "alloc/free" patterns..
> 
> Vlastimil? Who would be the best person to look at that SLAB_ACCOUNT
> thing?

Probably me.

I recently did some work on improving the kmem accounting performance,
which is mentioned in this thread and shaves off about 30%:
https://lore.kernel.org/lkml/20231019225346.1822282-1-roman.gushchin@linux.dev/

Overall the SLAB_ACCOUNT overhead looks big on micro-benchmarks simply because
the slab allocation path is really fast, so even touching a per-cpu variable adds
a noticeable overhead. There is nothing particularly slow on the kmem allocation
and release paths, but saving a memcg/objcg pointer, bumping the charge
and stats adds up, even though we have batching in place.

I believe the only real way to make it significantly faster is to cache
pre-charged slab objects, but it adds to the complexity and increases the memory
footprint. So far it has all been about micro-benchmarks; I haven't seen any
complaints about the performance of real workloads.
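
To illustrate what caching pre-charged objects would mean, a purely
hypothetical sketch (all names invented; kernel-style, not a proposal):

    #define OBJ_CACHE_SIZE 16

    /* Per-cpu stash of objects that are already charged to the objcg,
     * so the alloc/free fast paths skip charge/uncharge entirely. */
    struct precharged_cache {
            void *objs[OBJ_CACHE_SIZE];
            unsigned int nr;
    };

    static void *precharged_alloc(struct precharged_cache *c)
    {
            if (c->nr)
                    return c->objs[--c->nr];  /* fast: charge already paid */
            return NULL;  /* slow path: allocate + obj_cgroup_charge() */
    }

    static bool precharged_free(struct precharged_cache *c, void *obj)
    {
            if (c->nr < OBJ_CACHE_SIZE) {
                    c->objs[c->nr++] = obj;   /* keep object and its charge */
                    return true;
            }
            return false;  /* slow path: uncharge + kmem_cache_free() */
    }

The stashed objects pin both memory and charges, which is exactly the
complexity and memory-footprint trade-off mentioned above.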

Thanks!
Roman Gushchin Jan. 17, 2024, 10:20 p.m. UTC | #7
On Wed, Jan 17, 2024 at 01:02:19PM -0800, Shakeel Butt wrote:
> On Wed, Jan 17, 2024 at 12:21 PM Linus Torvalds
> <torvalds@linux-foundation.org> wrote:
> >
> > On Wed, 17 Jan 2024 at 11:39, Josh Poimboeuf <jpoimboe@kernel.org> wrote:
> > >
> > > That's a good point.  If the microbenchmark isn't likely to be even
> > > remotely realistic, maybe we should just revert the revert until if/when
> > > somebody shows a real world impact.
> > >
> > > Linus, any objections to that?
> >
> > We use SLAB_ACCOUNT for much more common allocations like queued
> > signals, so I would tend to agree with Jeff that it's probably just
> > some not very interesting microbenchmark that shows any file locking
> > effects from SLAB_ACCOUNT, not any real use.
> >
> > That said, those benchmarks do matter. It's very easy to say "not
> > relevant in the big picture" and then the end result is that
> > everything is a bit of a pig.
> >
> > And the regression was absolutely *ENORMOUS*. We're not talking "a few
> > percent". We're talking a 33% regression that caused the revert:
> >
> >    https://lore.kernel.org/lkml/20210907150757.GE17617@xsang-OptiPlex-9020/
> >
> > I wish our SLAB_ACCOUNT wasn't such a pig. Rather than account every
> > single allocation, it would be much nicer to account at a bigger
> > granularity, possibly by having per-thread counters first before
> > falling back to the obj_cgroup_charge. Whatever.
> >
> > It's kind of stupid to have a benchmark that just allocates and
> > deallocates a file lock in quick succession spend lots of time
> > incrementing and decrementing cgroup charges for that repeated
> > alloc/free.
> >
> > However, that problem with SLAB_ACCOUNT is not the fault of file
> > locking, but more of a slab issue.
> >
> > End result: I think we should bring in Vlastimil and whoever else is
> > doing SLAB_ACCOUNT things, and have them look at that side.
> >
> > And then just enable SLAB_ACCOUNT for file locks. But very much look
> > at silly costs in SLAB_ACCOUNT first, at least for trivial
> > "alloc/free" patterns..
> >
> > Vlastimil? Who would be the best person to look at that SLAB_ACCOUNT
> > thing? See commit 3754707bcc3e (Revert "memcg: enable accounting for
> > file lock caches") for the history here.
> >
> 
> Roman last looked into optimizing this code path. I suspect
> mod_objcg_state() to be more costly than obj_cgroup_charge(). I will
> try to measure this path and see if I can improve it.

It's roughly an equal split between mod_objcg_state() and obj_cgroup_charge().
And each is comparable (by order of magnitude) to the slab allocation cost
itself. On the free() path a significant cost comes simply from reading
the objcg pointer (it's usually a cache miss).

So I don't see how we can make it really cheap (say, less than 5% overhead)
without caching pre-accounted objects.

I thought about merging of charge and stats handling paths, which _maybe_ can
shave off another 20-30%, but there still will be a double-digit% accounting
overhead.

I'm curious to hear other ideas and suggestions.

Thanks!
Shakeel Butt Jan. 17, 2024, 10:56 p.m. UTC | #8
On Wed, Jan 17, 2024 at 2:20 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> On Wed, Jan 17, 2024 at 01:02:19PM -0800, Shakeel Butt wrote:
> > On Wed, Jan 17, 2024 at 12:21 PM Linus Torvalds
> > <torvalds@linux-foundation.org> wrote:
> > >
> > > On Wed, 17 Jan 2024 at 11:39, Josh Poimboeuf <jpoimboe@kernel.org> wrote:
> > > >
> > > > That's a good point.  If the microbenchmark isn't likely to be even
> > > > remotely realistic, maybe we should just revert the revert until if/when
> > > > somebody shows a real world impact.
> > > >
> > > > Linus, any objections to that?
> > >
> > > We use SLAB_ACCOUNT for much more common allocations like queued
> > > signals, so I would tend to agree with Jeff that it's probably just
> > > some not very interesting microbenchmark that shows any file locking
> > > effects from SLAB_ACCOUNT, not any real use.
> > >
> > > That said, those benchmarks do matter. It's very easy to say "not
> > > relevant in the big picture" and then the end result is that
> > > everything is a bit of a pig.
> > >
> > > And the regression was absolutely *ENORMOUS*. We're not talking "a few
> > > percent". We're talking a 33% regression that caused the revert:
> > >
> > >    https://lore.kernel.org/lkml/20210907150757.GE17617@xsang-OptiPlex-9020/
> > >
> > > I wish our SLAB_ACCOUNT wasn't such a pig. Rather than account every
> > > single allocation, it would be much nicer to account at a bigger
> > > granularity, possibly by having per-thread counters first before
> > > falling back to the obj_cgroup_charge. Whatever.
> > >
> > > It's kind of stupid to have a benchmark that just allocates and
> > > deallocates a file lock in quick succession spend lots of time
> > > incrementing and decrementing cgroup charges for that repeated
> > > alloc/free.
> > >
> > > However, that problem with SLAB_ACCOUNT is not the fault of file
> > > locking, but more of a slab issue.
> > >
> > > End result: I think we should bring in Vlastimil and whoever else is
> > > doing SLAB_ACCOUNT things, and have them look at that side.
> > >
> > > And then just enable SLAB_ACCOUNT for file locks. But very much look
> > > at silly costs in SLAB_ACCOUNT first, at least for trivial
> > > "alloc/free" patterns..
> > >
> > > Vlastimil? Who would be the best person to look at that SLAB_ACCOUNT
> > > thing? See commit 3754707bcc3e (Revert "memcg: enable accounting for
> > > file lock caches") for the history here.
> > >
> >
> > Roman last looked into optimizing this code path. I suspect
> > mod_objcg_state() to be more costly than obj_cgroup_charge(). I will
> > try to measure this path and see if I can improve it.
>
> It's roughly an equal split between mod_objcg_state() and obj_cgroup_charge().
> And each is comparable (by order of magnitude) to the slab allocation cost
> itself. On the free() path a significant cost comes simply from reading
> the objcg pointer (it's usually a cache miss).
>
> So I don't see how we can make it really cheap (say, less than 5% overhead)
> without caching pre-accounted objects.

Maybe this is what we want. Now we are down to just SLUB, maybe such
caching of pre-accounted objects can be in SLUB layer and we can
decide to keep this caching per-kmem-cache opt-in or always on.

>
> I thought about merging of charge and stats handling paths, which _maybe_ can
> shave off another 20-30%, but there still will be a double-digit% accounting
> overhead.
>
> I'm curious to hear other ideas and suggestions.
>
> Thanks!
Michal Hocko Jan. 18, 2024, 9:49 a.m. UTC | #9
On Wed 17-01-24 14:00:55, Jeff Layton wrote:
> I'm really not a fan of tunables or different kconfig options,
> especially for something niche like this.

I completely agree with that. We do have a plethora of kernel objects
which are accounted and we really do not want to have a kernel cmd line
option for each of them. Kernel memory accounting can already be disabled
through the kernel cmd line (cgroup.memory=nokmem).
Shakeel Butt Jan. 19, 2024, 7:47 a.m. UTC | #10
On Wed, Jan 17, 2024 at 2:20 PM Roman Gushchin <roman.gushchin@linux.dev> wrote:
>
> On Wed, Jan 17, 2024 at 01:02:19PM -0800, Shakeel Butt wrote:
> > On Wed, Jan 17, 2024 at 12:21 PM Linus Torvalds
> > <torvalds@linux-foundation.org> wrote:
> > >
> > > On Wed, 17 Jan 2024 at 11:39, Josh Poimboeuf <jpoimboe@kernel.org> wrote:
> > > >
> > > > That's a good point.  If the microbenchmark isn't likely to be even
> > > > remotely realistic, maybe we should just revert the revert until if/when
> > > > somebody shows a real world impact.
> > > >
> > > > Linus, any objections to that?
> > >
> > > We use SLAB_ACCOUNT for much more common allocations like queued
> > > signals, so I would tend to agree with Jeff that it's probably just
> > > some not very interesting microbenchmark that shows any file locking
> > > effects from SLAB_ACCOUNT, not any real use.
> > >
> > > That said, those benchmarks do matter. It's very easy to say "not
> > > relevant in the big picture" and then the end result is that
> > > everything is a bit of a pig.
> > >
> > > And the regression was absolutely *ENORMOUS*. We're not talking "a few
> > > percent". We're talking a 33% regression that caused the revert:
> > >
> > >    https://lore.kernel.org/lkml/20210907150757.GE17617@xsang-OptiPlex-9020/
> > >
> > > I wish our SLAB_ACCOUNT wasn't such a pig. Rather than account every
> > > single allocation, it would be much nicer to account at a bigger
> > > granularity, possibly by having per-thread counters first before
> > > falling back to the obj_cgroup_charge. Whatever.
> > >
> > > It's kind of stupid to have a benchmark that just allocates and
> > > deallocates a file lock in quick succession spend lots of time
> > > incrementing and decrementing cgroup charges for that repeated
> > > alloc/free.
> > >
> > > However, that problem with SLAB_ACCOUNT is not the fault of file
> > > locking, but more of a slab issue.
> > >
> > > End result: I think we should bring in Vlastimil and whoever else is
> > > doing SLAB_ACCOUNT things, and have them look at that side.
> > >
> > > And then just enable SLAB_ACCOUNT for file locks. But very much look
> > > at silly costs in SLAB_ACCOUNT first, at least for trivial
> > > "alloc/free" patterns..
> > >
> > > Vlastimil? Who would be the best person to look at that SLAB_ACCOUNT
> > > thing? See commit 3754707bcc3e (Revert "memcg: enable accounting for
> > > file lock caches") for the history here.
> > >
> >
> > Roman last looked into optimizing this code path. I suspect
> > mod_objcg_state() to be more costly than obj_cgroup_charge(). I will
> > try to measure this path and see if I can improve it.
>
> It's roughly an equal split between mod_objcg_state() and obj_cgroup_charge().
> And each is comparable (by order of magnitude) to the slab allocation cost
> itself. On the free() path a significant cost comes simply from reading
> the objcg pointer (it's usually a cache miss).
>
> So I don't see how we can make it really cheap (say, less than 5% overhead)
> without caching pre-accounted objects.
>
> I thought about merging of charge and stats handling paths, which _maybe_ can
> shave off another 20-30%, but there still will be a double-digit% accounting
> overhead.
>
> I'm curious to hear other ideas and suggestions.
>
> Thanks!

I profiled (perf record -a) the same benchmark, i.e. lock1_processes, on
an icelake machine with 72 cores and got the following results:

  12.72%  lock1_processes  [kernel.kallsyms]   [k] mod_objcg_state
  10.89%  lock1_processes  [kernel.kallsyms]   [k] kmem_cache_free
   8.40%  lock1_processes  [kernel.kallsyms]   [k] slab_post_alloc_hook
   8.36%  lock1_processes  [kernel.kallsyms]   [k] kmem_cache_alloc
   5.18%  lock1_processes  [kernel.kallsyms]   [k] refill_obj_stock
   5.18%  lock1_processes  [kernel.kallsyms]   [k] _copy_from_user

On annotating mod_objcg_state(), the following irq-disabling
instructions account for 30% of its time.

  6.64 │       pushfq
 10.26 │       popq   -0x38(%rbp)
  6.05 │       mov    -0x38(%rbp),%rcx
  7.60 │       cli
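
(That pushfq/popq/cli sequence is just what saving EFLAGS and disabling
interrupts compiles to on x86-64; in kernel terms it corresponds to a
pattern roughly like the sketch below, where the struct and function
names are illustrative only.)

    struct my_percpu_stats {            /* illustrative stand-in */
            long count;
    };

    static void stats_update_irqsafe(struct my_percpu_stats *stats, int delta)
    {
            unsigned long flags;

            local_irq_save(flags);      /* pushfq; popq <flags>; cli */
            stats->count += delta;      /* the actual per-cpu update */
            local_irq_restore(flags);   /* push <flags>; popfq */
    }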

For kmem_cache_free() & kmem_cache_alloc(), the following instruction
was expensive, which corresponds to __update_cpu_freelist_fast().

 16.33 │      cmpxchg16b %gs:(%rsi)

For slab_post_alloc_hook(), it's all over the place and
refill_obj_stock() is very similar to mod_objcg_state().

I will dig more in the next couple of days.
Linus Torvalds Jan. 22, 2024, 5:10 a.m. UTC | #11
On Wed, 17 Jan 2024 at 14:56, Shakeel Butt <shakeelb@google.com> wrote:
> >
> > So I don't see how we can make it really cheap (say, less than 5% overhead)
> > without caching pre-accounted objects.
>
> Maybe this is what we want. Now we are down to just SLUB, maybe such
> caching of pre-accounted objects can be in SLUB layer and we can
> decide to keep this caching per-kmem-cache opt-in or always on.

So it turns out that we have another case of SLAB_ACCOUNT being quite
a big expense, and it's actually the normal - but failed - open() or
execve() case.

See the thread at

    https://lore.kernel.org/all/CAHk-=whw936qzDLBQdUz-He5WK_0fRSWwKAjtbVsMGfX70Nf_Q@mail.gmail.com/

and to see the effect in profiles, you can use this EXTREMELY stupid
test program:

    #include <fcntl.h>

    int main(int argc, char **argv)
    {
        for (int i = 0; i < 10000000; i++)
                open("nonexistent", O_RDONLY);
    }

where the point of course is that the "nonexistent" pathname doesn't
actually exist (so don't create a file called that for the test).

What happens is that open() allocates a 'struct file *' early from the
filp kmem_cache, which has SLAB_ACCOUNT set. So we'll do accounting
for it, fail the pathname open, and free it again, which uncharges
the accounting.

Now, in this case, I actually have a suggestion: could we please just
make SLAB_ACCOUNT be something that we do *after* the allocation, kind
of the same way the zeroing works?

IOW, I'd love to get rid of slab_pre_alloc_hook() entirely, and make
slab_post_alloc_hook() do all the "charge the memcg if required".

Obviously that means that now a failure to charge the memcg would have
to then de-allocate things, but that's an uncommon path and would be
marked unlikely and not be in the hot path at all.

Now, the reason I would prefer that is that the *second* step would be to

 (a) expose a "kmem_cache_charge()" function that takes a
*non*-accounted slab allocation, and turns it into an accounted one
(and obviously this is why you want to do everything in the post-alloc
hook: just try to share this code)

 (b) remove the SLAB_ACCOUNT from the filp_cachep, making all file
allocations start out unaccounted.

 (c) when we have *actually* looked up the pathname and open the file
successfully, at *that* point we'd do a

        error = kmem_cache_charge(filp_cachep, file);

    in do_dentry_open() to turn the unaccounted file pointer into an
accounted one (and if that fails, we do the cleanup and free it, of
course, exactly like we do when file_get_write_access() fails)

which means that now the failure case doesn't unnecessarily charge the
allocation that never ends up being finalized.
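
In code form, the proposal amounts to roughly this (hypothetical:
kmem_cache_charge() does not exist at this point, and the call site is
paraphrased from the steps above):

    /* (a) proposed API: charge an already-allocated, so-far-unaccounted
     *     object to the current task's memcg */
    int kmem_cache_charge(struct kmem_cache *s, void *objp);

    /* (c) in do_dentry_open(), only once the open is actually succeeding */
    error = kmem_cache_charge(filp_cachep, file);
    if (error)
            goto cleanup_file;  /* free the never-charged file and bail */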

NOTE! I think this would clean up mm/slub.c too, simply because it
would get rid of that memcg_slab_pre_alloc_hook() entirely, and get
rid of the need to carry the "struct obj_cgroup **objcgp" pointer
along until the post-alloc hook: everything would be done post-alloc.

The actual kmem_cache_free() code already deals with "this slab hasn't
been accounted" because it obviously has to deal with allocations that
were done without __GFP_ACCOUNT anyway. So there's no change needed on
the freeing path, it already has to handle this all gracefully.

I may be missing something, but it would seem to have very little
downside, and fix a case that actually is visible right now.

              Linus
Shakeel Butt Jan. 22, 2024, 5:38 p.m. UTC | #12
Hi Linus,

On Sun, Jan 21, 2024 at 9:16 PM Linus Torvalds
<torvalds@linux-foundation.org> wrote:
>
> On Wed, 17 Jan 2024 at 14:56, Shakeel Butt <shakeelb@google.com> wrote:
> > >
> > > So I don't see how we can make it really cheap (say, less than 5% overhead)
> > > without caching pre-accounted objects.
> >
> > Maybe this is what we want. Now we are down to just SLUB, maybe such
> > caching of pre-accounted objects can be in SLUB layer and we can
> > decide to keep this caching per-kmem-cache opt-in or always on.
>
> So it turns out that we have another case of SLAB_ACCOUNT being quite
> a big expense, and it's actually the normal - but failed - open() or
> execve() case.
>
> See the thread at
>
>     https://lore.kernel.org/all/CAHk-=whw936qzDLBQdUz-He5WK_0fRSWwKAjtbVsMGfX70Nf_Q@mail.gmail.com/
>
> and to see the effect in profiles, you can use this EXTREMELY stupid
> test program:
>
>     #include <fcntl.h>
>
>     int main(int argc, char **argv)
>     {
>         for (int i = 0; i < 10000000; i++)
>                 open("nonexistent", O_RDONLY);
>     }
>
> where the point of course is that the "nonexistent" pathname doesn't
> actually exist (so don't create a file called that for the test).
>
> What happens is that open() allocates a 'struct file *' early from the
> filp kmem_cache, which has SLAB_ACCOUNT set. So we'll do accounting
> for it, fail the pathname open, and free it again, which uncharges
> the accounting.
>
> Now, in this case, I actually have a suggestion: could we please just
> make SLAB_ACCOUNT be something that we do *after* the allocation, kind
> of the same way the zeroing works?
>
> IOW, I'd love to get rid of slab_pre_alloc_hook() entirely, and make
> slab_post_alloc_hook() do all the "charge the memcg if required".
>
> Obviously that means that now a failure to charge the memcg would have
> to then de-allocate things, but that's an uncommon path and would be
> marked unlikely and not be in the hot path at all.
>
> Now, the reason I would prefer that is that the *second* step would be to
>
>  (a) expose a "kmem_cache_charge()" function that takes a
> *non*-accounted slab allocation, and turns it into an accounted one
> (and obviously this is why you want to do everything in the post-alloc
> hook: just try to share this code)
>
>  (b) remove the SLAB_ACCOUNT from the filp_cachep, making all file
> allocations start out unaccounted.
>
>  (c) when we have *actually* looked up the pathname and open the file
> successfully, at *that* point we'd do a
>
>         error = kmem_cache_charge(filp_cachep, file);
>
>     in do_dentry_open() to turn the unaccounted file pointer into an
> accounted one (and if that fails, we do the cleanup and free it, of
> course, exactly like we do when file_get_write_access() fails)
>
> which means that now the failure case doesn't unnecessarily charge the
> allocation that never ends up being finalized.
>
> NOTE! I think this would clean up mm/slub.c too, simply because it
> would get rid of that memcg_slab_pre_alloc_hook() entirely, and get
> rid of the need to carry the "struct obj_cgroup **objcgp" pointer
> along until the post-alloc hook: everything would be done post-alloc.
>
> The actual kmem_cache_free() code already deals with "this slab hasn't
> been accounted" because it obviously has to deal with allocations that
> were done without __GFP_ACCOUNT anyway. So there's no change needed on
> the freeing path, it already has to handle this all gracefully.
>
> I may be missing something, but it would seem to have very little
> downside, and fix a case that actually is visible right now.
>

Thanks for the idea. Actually I had a discussion with Vlastimil at LPC
'22 on a similar idea, but for a different problem, i.e. allocation in
irq context, or more specifically charging sockets for incoming TCP
connections. If you look at inet_csk_accept(), we solved this for TCP
memory by charging at accept() syscall time, but the kernel memory
holding the socket is still uncharged.
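
For reference, the accept()-time charging looks roughly like this in
inet_csk_accept() (trimmed and paraphrased, not the verbatim code):

    /* Associate the new socket with the acceptor's memcg, then charge
     * the TCP memory it has already consumed -- while the socket
     * allocation itself remains uncharged. */
    if (mem_cgroup_sockets_enabled) {
            lock_sock(newsk);
            mem_cgroup_sk_alloc(newsk);
            if (newsk->sk_memcg) {
                    int amt = sk_mem_pages(newsk->sk_forward_alloc +
                                           atomic_read(&newsk->sk_rmem_alloc));
                    if (amt)
                            mem_cgroup_charge_skmem(newsk->sk_memcg, amt,
                                                    GFP_KERNEL | __GFP_NOFAIL);
            }
            release_sock(newsk);
    }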

So, I think having the framework you suggested would help more
use-cases. I will take a stab at this in the next couple of days
(sorry stuck in some other stuff atm).

thanks,
Shakeel
Vlastimil Babka Jan. 26, 2024, 9:50 a.m. UTC | #13
On 1/22/24 06:10, Linus Torvalds wrote:
> On Wed, 17 Jan 2024 at 14:56, Shakeel Butt <shakeelb@google.com> wrote:
>> >
>> > So I don't see how we can make it really cheap (say, less than 5% overhead)
>> > without caching pre-accounted objects.
>>
>> Maybe this is what we want. Now we are down to just SLUB, maybe such
>> caching of pre-accounted objects can be in SLUB layer and we can
>> decide to keep this caching per-kmem-cache opt-in or always on.
> 
> So it turns out that we have another case of SLAB_ACCOUNT being quite
> a big expense, and it's actually the normal - but failed - open() or
> execve() case.
> 
> See the thread at
> 
>     https://lore.kernel.org/all/CAHk-=whw936qzDLBQdUz-He5WK_0fRSWwKAjtbVsMGfX70Nf_Q@mail.gmail.com/
> 
> and to see the effect in profiles, you can use this EXTREMELY stupid
> test program:
> 
>     #include <fcntl.h>
> 
>     int main(int argc, char **argv)
>     {
>         for (int i = 0; i < 10000000; i++)
>                 open("nonexistent", O_RDONLY);
>     }

This reminded me I can see should_failslab() in the profiles (1.43% plus the
overhead in its caller) even if it does nothing at all, and it's completely
unconditional since commit 4f6923fbb352 ("mm: make should_failslab always
available for fault injection").

We discussed it briefly when Jens tried to change it in [1] to depend on
CONFIG_FAILSLAB again. But now I think it should be even possible to leave
it always available, but behind a static key. BPF or whoever else uses these
error injection hooks would have to track how many users of them are active
and manage the static key accordingly. Then it could be always available,
but have no overhead when there's no active user? Other similar hooks could
benefit from such an approach too?

[1]
https://lore.kernel.org/all/e01e5e40-692a-519c-4cba-e3331f173c82@kernel.dk/#t


> where the point of course is that the "nonexistent" pathname doesn't
> actually exist (so don't create a file called that for the test).
> 
> What happens is that open() allocates a 'struct file *' early from the
> filp kmem_cache, which has SLAB_ACCOUNT set. So we'll do accounting
> for it, fail the pathname open, and free it again, which uncharges
> the accounting.
> 
> Now, in this case, I actually have a suggestion: could we please just
> make SLAB_ACCOUNT be something that we do *after* the allocation, kind
> of the same way the zeroing works?
> 
> IOW, I'd love to get rid of slab_pre_alloc_hook() entirely, and make
> slab_post_alloc_hook() do all the "charge the memcg if required".
> 
> Obviously that means that now a failure to charge the memcg would have
> to then de-allocate things, but that's an uncommon path and would be
> marked unlikely and not be in the hot path at all.
> 
> Now, the reason I would prefer that is that the *second* step would be to
> 
>  (a) expose a "kmem_cache_charge()" function that takes a
> *non*-accounted slab allocation, and turns it into an accounted one
> (and obviously this is why you want to do everything in the post-alloc
> hook: just try to share this code)
> 
>  (b) remove the SLAB_ACCOUNT from the filp_cachep, making all file
> allocations start out unaccounted.
> 
>  (c) when we have *actually* looked up the pathname and open the file
> successfully, at *that* point we'd do a
> 
>         error = kmem_cache_charge(filp_cachep, file);
> 
>     in do_dentry_open() to turn the unaccounted file pointer into an
> accounted one (and if that fails, we do the cleanup and free it, of
> course, exactly like we do when file_get_write_access() fails)
> 
> which means that now the failure case doesn't unnecessarily charge the
> allocation that never ends up being finalized.
> 
> NOTE! I think this would clean up mm/slub.c too, simply because it
> would get rid of that memcg_slab_pre_alloc_hook() entirely, and get
> rid of the need to carry the "struct obj_cgroup **objcgp" pointer
> along until the post-alloc hook: everything would be done post-alloc.
> 
> The actual kmem_cache_free() code already deals with "this slab hasn't
> been accounted" because it obviously has to deal with allocations that
> were done without __GFP_ACCOUNT anyway. So there's no change needed on
> the freeing path, it already has to handle this all gracefully.
> 
> I may be missing something, but it would seem to have very little
> downside, and fix a case that actually is visible right now.
> 
>               Linus
Vlastimil Babka Jan. 30, 2024, 11:04 a.m. UTC | #14
On 1/26/24 10:50, Vlastimil Babka wrote:
> On 1/22/24 06:10, Linus Torvalds wrote:
>> On Wed, 17 Jan 2024 at 14:56, Shakeel Butt <shakeelb@google.com> wrote:
>>> >
>>> > So I don't see how we can make it really cheap (say, less than 5% overhead)
>>> > without caching pre-accounted objects.
>>>
>>> Maybe this is what we want. Now we are down to just SLUB, maybe such
>>> caching of pre-accounted objects can be in SLUB layer and we can
>>> decide to keep this caching per-kmem-cache opt-in or always on.
>> 
>> So it turns out that we have another case of SLAB_ACCOUNT being quite
>> a big expense, and it's actually the normal - but failed - open() or
>> execve() case.
>> 
>> See the thread at
>> 
>>     https://lore.kernel.org/all/CAHk-=whw936qzDLBQdUz-He5WK_0fRSWwKAjtbVsMGfX70Nf_Q@mail.gmail.com/
>> 
>> and to see the effect in profiles, you can use this EXTREMELY stupid
>> test program:
>> 
>>     #include <fcntl.h>
>> 
>>     int main(int argc, char **argv)
>>     {
>>         for (int i = 0; i < 10000000; i++)
>>                 open("nonexistent", O_RDONLY);
>>     }
> 
> This reminded me I can see should_failslab() in the profiles (1.43% plus the
> overhead in its caller) even if it does nothing at all, and it's completely
> unconditional since commit 4f6923fbb352 ("mm: make should_failslab always
> available for fault injection").
> 
> We discussed it briefly when Jens tried to change it in [1] to depend on
> CONFIG_FAILSLAB again. But now I think it should be even possible to leave
> it always available, but behind a static key. BPF or whoever else uses these
> error injection hooks would have to track how many users of them are active
> and manage the static key accordingly. Then it could be always available,
> but have no overhead when there's no active user? Other similar hooks could
> benefit from such an approach too?
> 
> [1]
> https://lore.kernel.org/all/e01e5e40-692a-519c-4cba-e3331f173c82@kernel.dk/#t

Just for completeness, with the hunk below I've seen some 2% improvement on
the test program from Linus.
Of course something still needs to operate the static key, and I'm not
familiar enough with bpf and the related machinery to know whether it
already tracks users of the injection points, where the static key
toggling could be hooked.

diff --git a/mm/slub.c b/mm/slub.c
index 2ef88bbf56a3..da07b358d092 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3750,6 +3750,8 @@ noinline int should_failslab(struct kmem_cache *s, gfp_t gfpflags)
 }
 ALLOW_ERROR_INJECTION(should_failslab, ERRNO);
 
+static DEFINE_STATIC_KEY_FALSE(failslab_enabled);
+
 static __fastpath_inline
 struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
                                       struct list_lru *lru,
@@ -3760,8 +3762,10 @@ struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
 
        might_alloc(flags);
 
-       if (unlikely(should_failslab(s, flags)))
-               return NULL;
+       if (static_branch_unlikely(&failslab_enabled)) {
+               if (unlikely(should_failslab(s, flags)))
+                       return NULL;
+       }
 
        if (unlikely(!memcg_slab_pre_alloc_hook(s, lru, objcgp, size, flags)))
                return NULL;
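
Whatever attaches/detaches the error injection points would then drive
the key with the counting static-branch API, e.g. (hypothetical call
sites, not existing code):

    static_branch_inc(&failslab_enabled);   /* on attaching an override */
    static_branch_dec(&failslab_enabled);   /* on detaching it */
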
diff mbox series

Patch

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6ee0f9a5da70..91987b06bc52 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1527,6 +1527,23 @@ 
 			See Documentation/admin-guide/sysctl/net.rst for
 			fb_tunnels_only_for_init_ns
 
+	flock_accounting=
+			[KNL] Enable/disable accounting for kernel
+			memory allocations related to file locks.
+			Format: { on | off }
+			Default: on
+			on:	Enable kernel memory accounting for file
+				locks.  This prevents task groups from
+				exceeding their memcg allocation limits.
+				However, it may cause slowdowns in the
+				flock() system call.
+			off:	Disable kernel memory accounting for
+				file locks.  This may allow a rogue task
+				to DoS the system by forcing the kernel
+				to allocate memory beyond the task
+				group's memcg limits.  Not recommended
+				unless you have trusted user space.
+
 	floppy=		[HW]
 			See Documentation/admin-guide/blockdev/floppy.rst.
 
diff --git a/fs/locks.c b/fs/locks.c
index cc7c117ee192..235ac56c557d 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2905,15 +2905,41 @@  static int __init proc_locks_init(void)
 fs_initcall(proc_locks_init);
 #endif
 
+static bool flock_accounting __ro_after_init = true;
+
+static int __init flock_accounting_cmdline(char *str)
+{
+	if (!str)
+		return -EINVAL;
+
+	if (!strcmp(str, "off"))
+		flock_accounting = false;
+	else if (!strcmp(str, "on"))
+		flock_accounting = true;
+	else
+		return -EINVAL;
+
+	return 0;
+}
+early_param("flock_accounting", flock_accounting_cmdline);
+
+#define FLOCK_ACCOUNTING_MSG "WARNING: File lock accounting is disabled, container-triggered host memory exhaustion possible!\n"
+
 static int __init filelock_init(void)
 {
 	int i;
+	slab_flags_t flags = SLAB_PANIC;
+
+	if (!flock_accounting)
+		pr_err(FLOCK_ACCOUNTING_MSG);
+	else
+		flags |= SLAB_ACCOUNT;
 
 	flctx_cache = kmem_cache_create("file_lock_ctx",
-			sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL);
+			sizeof(struct file_lock_context), 0, flags, NULL);
 
 	filelock_cache = kmem_cache_create("file_lock_cache",
-			sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
+			sizeof(struct file_lock), 0, flags, NULL);
 
 	for_each_possible_cpu(i) {
 		struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i);