diff mbox series

[v3,31/35] lib: add memory allocations report in show_mem()

Message ID 20240212213922.783301-32-surenb@google.com (mailing list archive)
State New
Headers show
Series Memory allocation profiling | expand

Commit Message

Suren Baghdasaryan Feb. 12, 2024, 9:39 p.m. UTC
Include allocations in show_mem reports.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/alloc_tag.h |  2 ++
 lib/alloc_tag.c           | 38 ++++++++++++++++++++++++++++++++++++++
 mm/show_mem.c             | 15 +++++++++++++++
 3 files changed, 55 insertions(+)

Comments

Kees Cook Feb. 13, 2024, 12:10 a.m. UTC | #1
On Mon, Feb 12, 2024 at 01:39:17PM -0800, Suren Baghdasaryan wrote:
> Include allocations in show_mem reports.
> 
> Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> ---
>  include/linux/alloc_tag.h |  2 ++
>  lib/alloc_tag.c           | 38 ++++++++++++++++++++++++++++++++++++++
>  mm/show_mem.c             | 15 +++++++++++++++
>  3 files changed, 55 insertions(+)
> 
> diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
> index 3fe51e67e231..0a5973c4ad77 100644
> --- a/include/linux/alloc_tag.h
> +++ b/include/linux/alloc_tag.h
> @@ -30,6 +30,8 @@ struct alloc_tag {
>  
>  #ifdef CONFIG_MEM_ALLOC_PROFILING
>  
> +void alloc_tags_show_mem_report(struct seq_buf *s);
> +
>  static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct)
>  {
>  	return container_of(ct, struct alloc_tag, ct);
> diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
> index 2d5226d9262d..54312c213860 100644
> --- a/lib/alloc_tag.c
> +++ b/lib/alloc_tag.c
> @@ -96,6 +96,44 @@ static const struct seq_operations allocinfo_seq_op = {
>  	.show	= allocinfo_show,
>  };
>  
> +void alloc_tags_show_mem_report(struct seq_buf *s)
> +{
> +	struct codetag_iterator iter;
> +	struct codetag *ct;
> +	struct {
> +		struct codetag		*tag;
> +		size_t			bytes;
> +	} tags[10], n;
> +	unsigned int i, nr = 0;
> +
> +	codetag_lock_module_list(alloc_tag_cttype, true);
> +	iter = codetag_get_ct_iter(alloc_tag_cttype);
> +	while ((ct = codetag_next_ct(&iter))) {
> +		struct alloc_tag_counters counter = alloc_tag_read(ct_to_alloc_tag(ct));
> +
> +		n.tag	= ct;
> +		n.bytes = counter.bytes;
> +
> +		for (i = 0; i < nr; i++)
> +			if (n.bytes > tags[i].bytes)
> +				break;
> +
> +		if (i < ARRAY_SIZE(tags)) {
> +			nr -= nr == ARRAY_SIZE(tags);
> +			memmove(&tags[i + 1],
> +				&tags[i],
> +				sizeof(tags[0]) * (nr - i));
> +			nr++;
> +			tags[i] = n;
> +		}
> +	}
> +
> +	for (i = 0; i < nr; i++)
> +		alloc_tag_to_text(s, tags[i].tag);
> +
> +	codetag_lock_module_list(alloc_tag_cttype, false);
> +}
> +
>  static void __init procfs_init(void)
>  {
>  	proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op);
> diff --git a/mm/show_mem.c b/mm/show_mem.c
> index 8dcfafbd283c..d514c15ca076 100644
> --- a/mm/show_mem.c
> +++ b/mm/show_mem.c
> @@ -12,6 +12,7 @@
>  #include <linux/hugetlb.h>
>  #include <linux/mm.h>
>  #include <linux/mmzone.h>
> +#include <linux/seq_buf.h>
>  #include <linux/swap.h>
>  #include <linux/vmstat.h>
>  
> @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
>  #ifdef CONFIG_MEMORY_FAILURE
>  	printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
>  #endif
> +#ifdef CONFIG_MEM_ALLOC_PROFILING
> +	{
> +		struct seq_buf s;
> +		char *buf = kmalloc(4096, GFP_ATOMIC);

Why 4096? Maybe use PAGE_SIZE instead?

> +
> +		if (buf) {
> +			printk("Memory allocations:\n");

This needs a printk prefix, or better yet, just use pr_info() or similar.

> +			seq_buf_init(&s, buf, 4096);
> +			alloc_tags_show_mem_report(&s);
> +			printk("%s", buf);

Once a seq_buf "consumes" a char *, please don't use any directly any
more. This should be:

			pr_info("%s", seq_buf_str(s));

Otherwise %NUL termination isn't certain. Very likely, given the use
case here, but let's use good hygiene. :)

> +			kfree(buf);
> +		}
> +	}
> +#endif
>  }
> -- 
> 2.43.0.687.g38aa6559b0-goog
>
Steven Rostedt Feb. 13, 2024, 12:22 a.m. UTC | #2
On Mon, 12 Feb 2024 16:10:02 -0800
Kees Cook <keescook@chromium.org> wrote:

> >  #endif
> > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > +	{
> > +		struct seq_buf s;
> > +		char *buf = kmalloc(4096, GFP_ATOMIC);  
> 
> Why 4096? Maybe use PAGE_SIZE instead?

Will it make a difference for architectures that don't have 4096 PAGE_SIZE?
Like PowerPC which has PAGE_SIZE of anywhere between 4K to 256K!

-- Steve
Kent Overstreet Feb. 13, 2024, 4:33 a.m. UTC | #3
On Mon, Feb 12, 2024 at 07:22:42PM -0500, Steven Rostedt wrote:
> On Mon, 12 Feb 2024 16:10:02 -0800
> Kees Cook <keescook@chromium.org> wrote:
> 
> > >  #endif
> > > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > > +	{
> > > +		struct seq_buf s;
> > > +		char *buf = kmalloc(4096, GFP_ATOMIC);  
> > 
> > Why 4096? Maybe use PAGE_SIZE instead?
> 
> Will it make a difference for architectures that don't have 4096 PAGE_SIZE?
> Like PowerPC which has PAGE_SIZE of anywhere between 4K to 256K!

it's just a string buffer
Suren Baghdasaryan Feb. 13, 2024, 8:17 a.m. UTC | #4
On Mon, Feb 12, 2024 at 8:33 PM Kent Overstreet
<kent.overstreet@linux.dev> wrote:
>
> On Mon, Feb 12, 2024 at 07:22:42PM -0500, Steven Rostedt wrote:
> > On Mon, 12 Feb 2024 16:10:02 -0800
> > Kees Cook <keescook@chromium.org> wrote:
> >
> > > >  #endif
> > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > > > + {
> > > > +         struct seq_buf s;
> > > > +         char *buf = kmalloc(4096, GFP_ATOMIC);
> > >
> > > Why 4096? Maybe use PAGE_SIZE instead?
> >
> > Will it make a difference for architectures that don't have 4096 PAGE_SIZE?
> > Like PowerPC which has PAGE_SIZE of anywhere between 4K to 256K!
>
> it's just a string buffer

We should document that __show_mem() prints only the top 10 largest
allocations, therefore as long as this buffer is large enough to hold
10 records we should be good. Technically we could simply print one
record at a time and then the buffer can be smaller.
Michal Hocko Feb. 15, 2024, 9:22 a.m. UTC | #5
On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote:
[...]
> @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
>  #ifdef CONFIG_MEMORY_FAILURE
>  	printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
>  #endif
> +#ifdef CONFIG_MEM_ALLOC_PROFILING
> +	{
> +		struct seq_buf s;
> +		char *buf = kmalloc(4096, GFP_ATOMIC);
> +
> +		if (buf) {
> +			printk("Memory allocations:\n");
> +			seq_buf_init(&s, buf, 4096);
> +			alloc_tags_show_mem_report(&s);
> +			printk("%s", buf);
> +			kfree(buf);
> +		}
> +	}
> +#endif

I am pretty sure I have already objected to this. Memory allocations in
the oom path are simply no go unless there is absolutely no other way
around that. In this case the buffer could be preallocated.
Suren Baghdasaryan Feb. 15, 2024, 2:58 p.m. UTC | #6
On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote:
> [...]
> > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
> >  #ifdef CONFIG_MEMORY_FAILURE
> >       printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
> >  #endif
> > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > +     {
> > +             struct seq_buf s;
> > +             char *buf = kmalloc(4096, GFP_ATOMIC);
> > +
> > +             if (buf) {
> > +                     printk("Memory allocations:\n");
> > +                     seq_buf_init(&s, buf, 4096);
> > +                     alloc_tags_show_mem_report(&s);
> > +                     printk("%s", buf);
> > +                     kfree(buf);
> > +             }
> > +     }
> > +#endif
>
> I am pretty sure I have already objected to this. Memory allocations in
> the oom path are simply no go unless there is absolutely no other way
> around that. In this case the buffer could be preallocated.

Good point. We will change this to a smaller buffer allocated on the
stack and will print records one-by-one. Thanks!

>
> --
> Michal Hocko
> SUSE Labs
Michal Hocko Feb. 15, 2024, 4:44 p.m. UTC | #7
On Thu 15-02-24 06:58:42, Suren Baghdasaryan wrote:
> On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote:
> >
> > On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote:
> > [...]
> > > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
> > >  #ifdef CONFIG_MEMORY_FAILURE
> > >       printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
> > >  #endif
> > > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > > +     {
> > > +             struct seq_buf s;
> > > +             char *buf = kmalloc(4096, GFP_ATOMIC);
> > > +
> > > +             if (buf) {
> > > +                     printk("Memory allocations:\n");
> > > +                     seq_buf_init(&s, buf, 4096);
> > > +                     alloc_tags_show_mem_report(&s);
> > > +                     printk("%s", buf);
> > > +                     kfree(buf);
> > > +             }
> > > +     }
> > > +#endif
> >
> > I am pretty sure I have already objected to this. Memory allocations in
> > the oom path are simply no go unless there is absolutely no other way
> > around that. In this case the buffer could be preallocated.
> 
> Good point. We will change this to a smaller buffer allocated on the
> stack and will print records one-by-one. Thanks!

__show_mem could be called with a very deep call chains. A single
pre-allocated buffer should just do ok.
Suren Baghdasaryan Feb. 15, 2024, 4:47 p.m. UTC | #8
On Thu, Feb 15, 2024 at 8:45 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Thu 15-02-24 06:58:42, Suren Baghdasaryan wrote:
> > On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote:
> > >
> > > On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote:
> > > [...]
> > > > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
> > > >  #ifdef CONFIG_MEMORY_FAILURE
> > > >       printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
> > > >  #endif
> > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > > > +     {
> > > > +             struct seq_buf s;
> > > > +             char *buf = kmalloc(4096, GFP_ATOMIC);
> > > > +
> > > > +             if (buf) {
> > > > +                     printk("Memory allocations:\n");
> > > > +                     seq_buf_init(&s, buf, 4096);
> > > > +                     alloc_tags_show_mem_report(&s);
> > > > +                     printk("%s", buf);
> > > > +                     kfree(buf);
> > > > +             }
> > > > +     }
> > > > +#endif
> > >
> > > I am pretty sure I have already objected to this. Memory allocations in
> > > the oom path are simply no go unless there is absolutely no other way
> > > around that. In this case the buffer could be preallocated.
> >
> > Good point. We will change this to a smaller buffer allocated on the
> > stack and will print records one-by-one. Thanks!
>
> __show_mem could be called with a very deep call chains. A single
> pre-allocated buffer should just do ok.

Ack. Will do.

>
> --
> Michal Hocko
> SUSE Labs
Kent Overstreet Feb. 15, 2024, 6:29 p.m. UTC | #9
On Thu, Feb 15, 2024 at 08:47:59AM -0800, Suren Baghdasaryan wrote:
> On Thu, Feb 15, 2024 at 8:45 AM Michal Hocko <mhocko@suse.com> wrote:
> >
> > On Thu 15-02-24 06:58:42, Suren Baghdasaryan wrote:
> > > On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote:
> > > >
> > > > On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote:
> > > > [...]
> > > > > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
> > > > >  #ifdef CONFIG_MEMORY_FAILURE
> > > > >       printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
> > > > >  #endif
> > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > > > > +     {
> > > > > +             struct seq_buf s;
> > > > > +             char *buf = kmalloc(4096, GFP_ATOMIC);
> > > > > +
> > > > > +             if (buf) {
> > > > > +                     printk("Memory allocations:\n");
> > > > > +                     seq_buf_init(&s, buf, 4096);
> > > > > +                     alloc_tags_show_mem_report(&s);
> > > > > +                     printk("%s", buf);
> > > > > +                     kfree(buf);
> > > > > +             }
> > > > > +     }
> > > > > +#endif
> > > >
> > > > I am pretty sure I have already objected to this. Memory allocations in
> > > > the oom path are simply no go unless there is absolutely no other way
> > > > around that. In this case the buffer could be preallocated.
> > >
> > > Good point. We will change this to a smaller buffer allocated on the
> > > stack and will print records one-by-one. Thanks!
> >
> > __show_mem could be called with a very deep call chains. A single
> > pre-allocated buffer should just do ok.
> 
> Ack. Will do.

No, we're not going to permanently burn 4k here.

It's completely fine if the allocation fails, there's nothing "unsafe"
about doing a GFP_ATOMIC allocation here.
Suren Baghdasaryan Feb. 15, 2024, 6:33 p.m. UTC | #10
On Thu, Feb 15, 2024 at 10:29 AM Kent Overstreet
<kent.overstreet@linux.dev> wrote:
>
> On Thu, Feb 15, 2024 at 08:47:59AM -0800, Suren Baghdasaryan wrote:
> > On Thu, Feb 15, 2024 at 8:45 AM Michal Hocko <mhocko@suse.com> wrote:
> > >
> > > On Thu 15-02-24 06:58:42, Suren Baghdasaryan wrote:
> > > > On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote:
> > > > >
> > > > > On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote:
> > > > > [...]
> > > > > > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
> > > > > >  #ifdef CONFIG_MEMORY_FAILURE
> > > > > >       printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
> > > > > >  #endif
> > > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > > > > > +     {
> > > > > > +             struct seq_buf s;
> > > > > > +             char *buf = kmalloc(4096, GFP_ATOMIC);
> > > > > > +
> > > > > > +             if (buf) {
> > > > > > +                     printk("Memory allocations:\n");
> > > > > > +                     seq_buf_init(&s, buf, 4096);
> > > > > > +                     alloc_tags_show_mem_report(&s);
> > > > > > +                     printk("%s", buf);
> > > > > > +                     kfree(buf);
> > > > > > +             }
> > > > > > +     }
> > > > > > +#endif
> > > > >
> > > > > I am pretty sure I have already objected to this. Memory allocations in
> > > > > the oom path are simply no go unless there is absolutely no other way
> > > > > around that. In this case the buffer could be preallocated.
> > > >
> > > > Good point. We will change this to a smaller buffer allocated on the
> > > > stack and will print records one-by-one. Thanks!
> > >
> > > __show_mem could be called with a very deep call chains. A single
> > > pre-allocated buffer should just do ok.
> >
> > Ack. Will do.
>
> No, we're not going to permanently burn 4k here.

We don't need 4K here. Just enough to store one line and then print
these 10 highest allocations one line at a time. This way we can also
change that 10 to any higher number we like without any side effects.

>
> It's completely fine if the allocation fails, there's nothing "unsafe"
> about doing a GFP_ATOMIC allocation here.
Kent Overstreet Feb. 15, 2024, 6:38 p.m. UTC | #11
On Thu, Feb 15, 2024 at 10:33:53AM -0800, Suren Baghdasaryan wrote:
> On Thu, Feb 15, 2024 at 10:29 AM Kent Overstreet
> <kent.overstreet@linux.dev> wrote:
> >
> > On Thu, Feb 15, 2024 at 08:47:59AM -0800, Suren Baghdasaryan wrote:
> > > On Thu, Feb 15, 2024 at 8:45 AM Michal Hocko <mhocko@suse.com> wrote:
> > > >
> > > > On Thu 15-02-24 06:58:42, Suren Baghdasaryan wrote:
> > > > > On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote:
> > > > > >
> > > > > > On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote:
> > > > > > [...]
> > > > > > > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
> > > > > > >  #ifdef CONFIG_MEMORY_FAILURE
> > > > > > >       printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
> > > > > > >  #endif
> > > > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > > > > > > +     {
> > > > > > > +             struct seq_buf s;
> > > > > > > +             char *buf = kmalloc(4096, GFP_ATOMIC);
> > > > > > > +
> > > > > > > +             if (buf) {
> > > > > > > +                     printk("Memory allocations:\n");
> > > > > > > +                     seq_buf_init(&s, buf, 4096);
> > > > > > > +                     alloc_tags_show_mem_report(&s);
> > > > > > > +                     printk("%s", buf);
> > > > > > > +                     kfree(buf);
> > > > > > > +             }
> > > > > > > +     }
> > > > > > > +#endif
> > > > > >
> > > > > > I am pretty sure I have already objected to this. Memory allocations in
> > > > > > the oom path are simply no go unless there is absolutely no other way
> > > > > > around that. In this case the buffer could be preallocated.
> > > > >
> > > > > Good point. We will change this to a smaller buffer allocated on the
> > > > > stack and will print records one-by-one. Thanks!
> > > >
> > > > __show_mem could be called with a very deep call chains. A single
> > > > pre-allocated buffer should just do ok.
> > >
> > > Ack. Will do.
> >
> > No, we're not going to permanently burn 4k here.
> 
> We don't need 4K here. Just enough to store one line and then print
> these 10 highest allocations one line at a time. This way we can also
> change that 10 to any higher number we like without any side effects.

There's no reason to make the change at all. If Michal thinks there's
something "dangerous" about allocating a buffer here, he needs to able
to explain what it is.
Michal Hocko Feb. 15, 2024, 6:41 p.m. UTC | #12
On Thu 15-02-24 13:29:40, Kent Overstreet wrote:
> On Thu, Feb 15, 2024 at 08:47:59AM -0800, Suren Baghdasaryan wrote:
> > On Thu, Feb 15, 2024 at 8:45 AM Michal Hocko <mhocko@suse.com> wrote:
> > >
> > > On Thu 15-02-24 06:58:42, Suren Baghdasaryan wrote:
> > > > On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote:
> > > > >
> > > > > On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote:
> > > > > [...]
> > > > > > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
> > > > > >  #ifdef CONFIG_MEMORY_FAILURE
> > > > > >       printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
> > > > > >  #endif
> > > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > > > > > +     {
> > > > > > +             struct seq_buf s;
> > > > > > +             char *buf = kmalloc(4096, GFP_ATOMIC);
> > > > > > +
> > > > > > +             if (buf) {
> > > > > > +                     printk("Memory allocations:\n");
> > > > > > +                     seq_buf_init(&s, buf, 4096);
> > > > > > +                     alloc_tags_show_mem_report(&s);
> > > > > > +                     printk("%s", buf);
> > > > > > +                     kfree(buf);
> > > > > > +             }
> > > > > > +     }
> > > > > > +#endif
> > > > >
> > > > > I am pretty sure I have already objected to this. Memory allocations in
> > > > > the oom path are simply no go unless there is absolutely no other way
> > > > > around that. In this case the buffer could be preallocated.
> > > >
> > > > Good point. We will change this to a smaller buffer allocated on the
> > > > stack and will print records one-by-one. Thanks!
> > >
> > > __show_mem could be called with a very deep call chains. A single
> > > pre-allocated buffer should just do ok.
> > 
> > Ack. Will do.
> 
> No, we're not going to permanently burn 4k here.
> 
> It's completely fine if the allocation fails, there's nothing "unsafe"
> about doing a GFP_ATOMIC allocation here.

Nobody is talking about safety. This is just a wrong thing to do when
you are likely under OOM situation. This is a situation when you
GFP_ATOMIC allocation is _likely_ to fail. Yes, yes you will get some
additional memory reservers head room, but you shouldn't rely on that
because that will make the output unreliable. Not something you want in
situation when you really want to know that information.

More over you do not need to preallocate a full page.
Suren Baghdasaryan Feb. 15, 2024, 6:49 p.m. UTC | #13
On Thu, Feb 15, 2024 at 10:41 AM Michal Hocko <mhocko@suse.com> wrote:
>
> On Thu 15-02-24 13:29:40, Kent Overstreet wrote:
> > On Thu, Feb 15, 2024 at 08:47:59AM -0800, Suren Baghdasaryan wrote:
> > > On Thu, Feb 15, 2024 at 8:45 AM Michal Hocko <mhocko@suse.com> wrote:
> > > >
> > > > On Thu 15-02-24 06:58:42, Suren Baghdasaryan wrote:
> > > > > On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote:
> > > > > >
> > > > > > On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote:
> > > > > > [...]
> > > > > > > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
> > > > > > >  #ifdef CONFIG_MEMORY_FAILURE
> > > > > > >       printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
> > > > > > >  #endif
> > > > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > > > > > > +     {
> > > > > > > +             struct seq_buf s;
> > > > > > > +             char *buf = kmalloc(4096, GFP_ATOMIC);
> > > > > > > +
> > > > > > > +             if (buf) {
> > > > > > > +                     printk("Memory allocations:\n");
> > > > > > > +                     seq_buf_init(&s, buf, 4096);
> > > > > > > +                     alloc_tags_show_mem_report(&s);
> > > > > > > +                     printk("%s", buf);
> > > > > > > +                     kfree(buf);
> > > > > > > +             }
> > > > > > > +     }
> > > > > > > +#endif
> > > > > >
> > > > > > I am pretty sure I have already objected to this. Memory allocations in
> > > > > > the oom path are simply no go unless there is absolutely no other way
> > > > > > around that. In this case the buffer could be preallocated.
> > > > >
> > > > > Good point. We will change this to a smaller buffer allocated on the
> > > > > stack and will print records one-by-one. Thanks!
> > > >
> > > > __show_mem could be called with a very deep call chains. A single
> > > > pre-allocated buffer should just do ok.
> > >
> > > Ack. Will do.
> >
> > No, we're not going to permanently burn 4k here.
> >
> > It's completely fine if the allocation fails, there's nothing "unsafe"
> > about doing a GFP_ATOMIC allocation here.
>
> Nobody is talking about safety. This is just a wrong thing to do when
> you are likely under OOM situation. This is a situation when you
> GFP_ATOMIC allocation is _likely_ to fail. Yes, yes you will get some
> additional memory reservers head room, but you shouldn't rely on that
> because that will make the output unreliable. Not something you want in
> situation when you really want to know that information.
>
> More over you do not need to preallocate a full page.

Folks, please stop arguing about it. We have more important things to
do. I'll fix it to use a small preallocated buffer.

>
> --
> Michal Hocko
> SUSE Labs
Vlastimil Babka Feb. 15, 2024, 8:22 p.m. UTC | #14
On 2/15/24 19:29, Kent Overstreet wrote:
> On Thu, Feb 15, 2024 at 08:47:59AM -0800, Suren Baghdasaryan wrote:
>> On Thu, Feb 15, 2024 at 8:45 AM Michal Hocko <mhocko@suse.com> wrote:
>> >
>> > On Thu 15-02-24 06:58:42, Suren Baghdasaryan wrote:
>> > > On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote:
>> > > >
>> > > > On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote:
>> > > > [...]
>> > > > > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
>> > > > >  #ifdef CONFIG_MEMORY_FAILURE
>> > > > >       printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
>> > > > >  #endif
>> > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING
>> > > > > +     {
>> > > > > +             struct seq_buf s;
>> > > > > +             char *buf = kmalloc(4096, GFP_ATOMIC);
>> > > > > +
>> > > > > +             if (buf) {
>> > > > > +                     printk("Memory allocations:\n");
>> > > > > +                     seq_buf_init(&s, buf, 4096);
>> > > > > +                     alloc_tags_show_mem_report(&s);
>> > > > > +                     printk("%s", buf);
>> > > > > +                     kfree(buf);
>> > > > > +             }
>> > > > > +     }
>> > > > > +#endif
>> > > >
>> > > > I am pretty sure I have already objected to this. Memory allocations in
>> > > > the oom path are simply no go unless there is absolutely no other way
>> > > > around that. In this case the buffer could be preallocated.
>> > >
>> > > Good point. We will change this to a smaller buffer allocated on the
>> > > stack and will print records one-by-one. Thanks!
>> >
>> > __show_mem could be called with a very deep call chains. A single
>> > pre-allocated buffer should just do ok.
>> 
>> Ack. Will do.
> 
> No, we're not going to permanently burn 4k here.
> 
> It's completely fine if the allocation fails, there's nothing "unsafe"
> about doing a GFP_ATOMIC allocation here.

Well, I think without __GFP_NOWARN it will cause a warning and thus
recursion into __show_mem(), potentially infinite? Which is of course
trivial to fix, but I'd myself rather sacrifice a bit of memory to get this
potentially very useful output, if I enabled the profiling. The necessary
memory overhead of page_ext and slabobj_ext makes the printing buffer
overhead negligible in comparison?
Kent Overstreet Feb. 15, 2024, 8:33 p.m. UTC | #15
On Thu, Feb 15, 2024 at 09:22:07PM +0100, Vlastimil Babka wrote:
> On 2/15/24 19:29, Kent Overstreet wrote:
> > On Thu, Feb 15, 2024 at 08:47:59AM -0800, Suren Baghdasaryan wrote:
> >> On Thu, Feb 15, 2024 at 8:45 AM Michal Hocko <mhocko@suse.com> wrote:
> >> >
> >> > On Thu 15-02-24 06:58:42, Suren Baghdasaryan wrote:
> >> > > On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote:
> >> > > >
> >> > > > On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote:
> >> > > > [...]
> >> > > > > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
> >> > > > >  #ifdef CONFIG_MEMORY_FAILURE
> >> > > > >       printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
> >> > > > >  #endif
> >> > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> >> > > > > +     {
> >> > > > > +             struct seq_buf s;
> >> > > > > +             char *buf = kmalloc(4096, GFP_ATOMIC);
> >> > > > > +
> >> > > > > +             if (buf) {
> >> > > > > +                     printk("Memory allocations:\n");
> >> > > > > +                     seq_buf_init(&s, buf, 4096);
> >> > > > > +                     alloc_tags_show_mem_report(&s);
> >> > > > > +                     printk("%s", buf);
> >> > > > > +                     kfree(buf);
> >> > > > > +             }
> >> > > > > +     }
> >> > > > > +#endif
> >> > > >
> >> > > > I am pretty sure I have already objected to this. Memory allocations in
> >> > > > the oom path are simply no go unless there is absolutely no other way
> >> > > > around that. In this case the buffer could be preallocated.
> >> > >
> >> > > Good point. We will change this to a smaller buffer allocated on the
> >> > > stack and will print records one-by-one. Thanks!
> >> >
> >> > __show_mem could be called with a very deep call chains. A single
> >> > pre-allocated buffer should just do ok.
> >> 
> >> Ack. Will do.
> > 
> > No, we're not going to permanently burn 4k here.
> > 
> > It's completely fine if the allocation fails, there's nothing "unsafe"
> > about doing a GFP_ATOMIC allocation here.
> 
> Well, I think without __GFP_NOWARN it will cause a warning and thus
> recursion into __show_mem(), potentially infinite? Which is of course
> trivial to fix, but I'd myself rather sacrifice a bit of memory to get this
> potentially very useful output, if I enabled the profiling. The necessary
> memory overhead of page_ext and slabobj_ext makes the printing buffer
> overhead negligible in comparison?

__GFP_NOWARN is a good point, we should have that.

But - and correct me if I'm wrong here - doesn't an OOM kick in well
before GFP_ATOMIC 4k allocations are failing? I'd expect the system to
be well and truly hosed at that point.

If we want this report to be 100% reliable, then yes the preallocated
buffer makes sense - but I don't think 100% makes sense here; I think we
can accept ~99% and give back that 4k.
Michal Hocko Feb. 15, 2024, 9:54 p.m. UTC | #16
On Thu 15-02-24 15:33:30, Kent Overstreet wrote:
> On Thu, Feb 15, 2024 at 09:22:07PM +0100, Vlastimil Babka wrote:
> > On 2/15/24 19:29, Kent Overstreet wrote:
> > > On Thu, Feb 15, 2024 at 08:47:59AM -0800, Suren Baghdasaryan wrote:
> > >> On Thu, Feb 15, 2024 at 8:45 AM Michal Hocko <mhocko@suse.com> wrote:
> > >> >
> > >> > On Thu 15-02-24 06:58:42, Suren Baghdasaryan wrote:
> > >> > > On Thu, Feb 15, 2024 at 1:22 AM Michal Hocko <mhocko@suse.com> wrote:
> > >> > > >
> > >> > > > On Mon 12-02-24 13:39:17, Suren Baghdasaryan wrote:
> > >> > > > [...]
> > >> > > > > @@ -423,4 +424,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
> > >> > > > >  #ifdef CONFIG_MEMORY_FAILURE
> > >> > > > >       printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
> > >> > > > >  #endif
> > >> > > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > >> > > > > +     {
> > >> > > > > +             struct seq_buf s;
> > >> > > > > +             char *buf = kmalloc(4096, GFP_ATOMIC);
> > >> > > > > +
> > >> > > > > +             if (buf) {
> > >> > > > > +                     printk("Memory allocations:\n");
> > >> > > > > +                     seq_buf_init(&s, buf, 4096);
> > >> > > > > +                     alloc_tags_show_mem_report(&s);
> > >> > > > > +                     printk("%s", buf);
> > >> > > > > +                     kfree(buf);
> > >> > > > > +             }
> > >> > > > > +     }
> > >> > > > > +#endif
> > >> > > >
> > >> > > > I am pretty sure I have already objected to this. Memory allocations in
> > >> > > > the oom path are simply no go unless there is absolutely no other way
> > >> > > > around that. In this case the buffer could be preallocated.
> > >> > >
> > >> > > Good point. We will change this to a smaller buffer allocated on the
> > >> > > stack and will print records one-by-one. Thanks!
> > >> >
> > >> > __show_mem could be called with a very deep call chains. A single
> > >> > pre-allocated buffer should just do ok.
> > >> 
> > >> Ack. Will do.
> > > 
> > > No, we're not going to permanently burn 4k here.
> > > 
> > > It's completely fine if the allocation fails, there's nothing "unsafe"
> > > about doing a GFP_ATOMIC allocation here.
> > 
> > Well, I think without __GFP_NOWARN it will cause a warning and thus
> > recursion into __show_mem(), potentially infinite? Which is of course
> > trivial to fix, but I'd myself rather sacrifice a bit of memory to get this
> > potentially very useful output, if I enabled the profiling. The necessary
> > memory overhead of page_ext and slabobj_ext makes the printing buffer
> > overhead negligible in comparison?
> 
> __GFP_NOWARN is a good point, we should have that.
> 
> But - and correct me if I'm wrong here - doesn't an OOM kick in well
> before GFP_ATOMIC 4k allocations are failing?

Not really, GFP_ATOMIC users can compete with reclaimers and consume
those reserves.

> I'd expect the system to
> be well and truly hosed at that point.

It is OOMed...
 
> If we want this report to be 100% reliable, then yes the preallocated
> buffer makes sense - but I don't think 100% makes sense here; I think we
> can accept ~99% and give back that 4k.

Think about that from the memory reserves consumers. The atomic reserve
is a scarse resource and now you want to use it for debugging purposes
for which you could have preallocated.
Kent Overstreet Feb. 15, 2024, 10:54 p.m. UTC | #17
On Thu, Feb 15, 2024 at 10:54:53PM +0100, Michal Hocko wrote:
> On Thu 15-02-24 15:33:30, Kent Overstreet wrote:
> > If we want this report to be 100% reliable, then yes the preallocated
> > buffer makes sense - but I don't think 100% makes sense here; I think we
> > can accept ~99% and give back that 4k.
> 
> Think about that from the memory reserves consumers. The atomic reserve
> is a scarse resource and now you want to use it for debugging purposes
> for which you could have preallocated.

_Memory_ is a finite resource that we shouldn't be using unnecessarily. 

We don't need this for the entire time we're under memory pressure; just
the short duration it takes to generate the report, then it's back
available for other users.

You would have us dedicate 4k, from system bootup, that can never be
used by other users.

Again: this makes no sense. The whole point of having watermarks and
shared reserves is so that every codepath doesn't have to have its own
dedicated, private reserve, so that we can make better use of a shared
finite resource.
Steven Rostedt Feb. 15, 2024, 11:07 p.m. UTC | #18
On Thu, 15 Feb 2024 15:33:30 -0500
Kent Overstreet <kent.overstreet@linux.dev> wrote:

> > Well, I think without __GFP_NOWARN it will cause a warning and thus
> > recursion into __show_mem(), potentially infinite? Which is of course
> > trivial to fix, but I'd myself rather sacrifice a bit of memory to get
> > this potentially very useful output, if I enabled the profiling. The
> > necessary memory overhead of page_ext and slabobj_ext makes the
> > printing buffer overhead negligible in comparison?  
> 
> __GFP_NOWARN is a good point, we should have that.
> 
> But - and correct me if I'm wrong here - doesn't an OOM kick in well
> before GFP_ATOMIC 4k allocations are failing? I'd expect the system to
> be well and truly hosed at that point.
> 
> If we want this report to be 100% reliable, then yes the preallocated
> buffer makes sense - but I don't think 100% makes sense here; I think we
> can accept ~99% and give back that 4k.

I just compiled v6.8-rc4 vanilla (with a fedora localmodconfig build) and
saved it off (vmlinux.orig), then I compiled with the following:

Applied the patches but did not enable anything:	vmlinux.memtag-off
Enabled MEM_ALLOC_PROFILING:				vmlinux.memtag
Enabled MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT:		vmlinux.memtag-default-on
Enabled MEM_ALLOC_PROFILING_DEBUG:			vmlinux.memtag-debug

And here's what I got:

   text         data            bss     dec             hex filename
29161847        18352730        5619716 53134293        32ac3d5 vmlinux.orig
29162286        18382638        5595140 53140064        32ada60 vmlinux.memtag-off		(+5771)
29230868        18887662        5275652 53394182        32ebb06 vmlinux.memtag			(+259889)
29230746        18887662        5275652 53394060        32eba8c vmlinux.memtag-default-on	(+259767) dropped?
29276214        18946374        5177348 53399936        32ed180 vmlinux.memtag-debug		(+265643)

Just adding the patches increases the size by 5k. But the rest shows an
increase of 259k, and you are worried about 4k (and possibly less?)???

-- Steve
Steven Rostedt Feb. 15, 2024, 11:16 p.m. UTC | #19
On Thu, 15 Feb 2024 18:07:42 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

>    text         data            bss     dec             hex filename
> 29161847        18352730        5619716 53134293        32ac3d5 vmlinux.orig
> 29162286        18382638        5595140 53140064        32ada60 vmlinux.memtag-off		(+5771)
> 29230868        18887662        5275652 53394182        32ebb06 vmlinux.memtag			(+259889)
> 29230746        18887662        5275652 53394060        32eba8c vmlinux.memtag-default-on	(+259767) dropped?
> 29276214        18946374        5177348 53399936        32ed180 vmlinux.memtag-debug		(+265643)

If you plan on running this in production, and this increases the size of
the text by 68k, have you measured the I$ pressure that this may induce?
That is, what is the full overhead of having this enabled, as it could
cause more instruction cache misses?

I wonder if there has been measurements of it off. That is, having this
configured in but default off still increases the text size by 68k. That
can't be good on the instruction cache.

-- Steve
Dave Hansen Feb. 15, 2024, 11:19 p.m. UTC | #20
On 2/15/24 15:07, Steven Rostedt wrote:
> Just adding the patches increases the size by 5k. But the rest shows an
> increase of 259k, and you are worried about 4k (and possibly less?)???

Doesn't the new page_ext thingy add a pointer per 'struct page', or
~0.2% of RAM, or ~32MB on a 16GB laptop?  I, too, am confused why 4k is
even remotely an issue.
Steven Rostedt Feb. 15, 2024, 11:27 p.m. UTC | #21
On Thu, 15 Feb 2024 18:16:48 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Thu, 15 Feb 2024 18:07:42 -0500
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> >    text         data            bss     dec             hex filename
> > 29161847        18352730        5619716 53134293        32ac3d5 vmlinux.orig
> > 29162286        18382638        5595140 53140064        32ada60 vmlinux.memtag-off		(+5771)
> > 29230868        18887662        5275652 53394182        32ebb06 vmlinux.memtag			(+259889)
> > 29230746        18887662        5275652 53394060        32eba8c vmlinux.memtag-default-on	(+259767) dropped?
> > 29276214        18946374        5177348 53399936        32ed180 vmlinux.memtag-debug		(+265643)  
> 
> If you plan on running this in production, and this increases the size of
> the text by 68k, have you measured the I$ pressure that this may induce?
> That is, what is the full overhead of having this enabled, as it could
> cause more instruction cache misses?
> 
> I wonder if there has been measurements of it off. That is, having this
> configured in but default off still increases the text size by 68k. That
> can't be good on the instruction cache.
> 

I should have read the cover letter ;-)  (someone pointed me to that on IRC):

> Performance overhead:
> To evaluate performance we implemented an in-kernel test executing
> multiple get_free_page/free_page and kmalloc/kfree calls with allocation
> sizes growing from 8 to 240 bytes with CPU frequency set to max and CPU
> affinity set to a specific CPU to minimize the noise. Below are results
> from running the test on Ubuntu 22.04.2 LTS with 6.8.0-rc1 kernel on
> 56 core Intel Xeon:

These are micro benchmarks, were any larger benchmarks taken? As
microbenchmarks do not always show I$ issues (because the benchmark itself
will warm up the cache). The cache issue could slow down tasks at a bigger
picture, as it can cause more cache misses.

Running other benchmarks under perf and recording the cache misses between
the different configs would be a good picture to show.

> 
>                         kmalloc                 pgalloc
> (1 baseline)            6.764s                  16.902s
> (2 default disabled)    6.793s (+0.43%)         17.007s (+0.62%)
> (3 default enabled)     7.197s (+6.40%)         23.666s (+40.02%)
> (4 runtime enabled)     7.405s (+9.48%)         23.901s (+41.41%)
> (5 memcg)               13.388s (+97.94%)       48.460s (+186.71%)


> 
> Memory overhead:
> Kernel size:
> 
>    text           data        bss         dec         diff
> (1) 26515311	      18890222    17018880    62424413
> (2) 26524728	      19423818    16740352    62688898    264485
> (3) 26524724	      19423818    16740352    62688894    264481
> (4) 26524728	      19423818    16740352    62688898    264485
> (5) 26541782	      18964374    16957440    62463596    39183

Similar to my builds.


> 
> Memory consumption on a 56 core Intel CPU with 125GB of memory:
> Code tags:           192 kB
> PageExts:         262144 kB (256MB)
> SlabExts:           9876 kB (9.6MB)
> PcpuExts:            512 kB (0.5MB)
> 
> Total overhead is 0.2% of total memory.


All this, and we are still worried about 4k for useful debugging :-/

-- Steve
Kent Overstreet Feb. 15, 2024, 11:51 p.m. UTC | #22
On Thu, Feb 15, 2024 at 06:07:42PM -0500, Steven Rostedt wrote:
> On Thu, 15 Feb 2024 15:33:30 -0500
> Kent Overstreet <kent.overstreet@linux.dev> wrote:
> 
> > > Well, I think without __GFP_NOWARN it will cause a warning and thus
> > > recursion into __show_mem(), potentially infinite? Which is of course
> > > trivial to fix, but I'd myself rather sacrifice a bit of memory to get
> > > this potentially very useful output, if I enabled the profiling. The
> > > necessary memory overhead of page_ext and slabobj_ext makes the
> > > printing buffer overhead negligible in comparison?  
> > 
> > __GFP_NOWARN is a good point, we should have that.
> > 
> > But - and correct me if I'm wrong here - doesn't an OOM kick in well
> > before GFP_ATOMIC 4k allocations are failing? I'd expect the system to
> > be well and truly hosed at that point.
> > 
> > If we want this report to be 100% reliable, then yes the preallocated
> > buffer makes sense - but I don't think 100% makes sense here; I think we
> > can accept ~99% and give back that 4k.
> 
> I just compiled v6.8-rc4 vanilla (with a fedora localmodconfig build) and
> saved it off (vmlinux.orig), then I compiled with the following:
> 
> Applied the patches but did not enable anything:	vmlinux.memtag-off
> Enabled MEM_ALLOC_PROFILING:				vmlinux.memtag
> Enabled MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT:		vmlinux.memtag-default-on
> Enabled MEM_ALLOC_PROFILING_DEBUG:			vmlinux.memtag-debug
> 
> And here's what I got:
> 
>    text         data            bss     dec             hex filename
> 29161847        18352730        5619716 53134293        32ac3d5 vmlinux.orig
> 29162286        18382638        5595140 53140064        32ada60 vmlinux.memtag-off		(+5771)
> 29230868        18887662        5275652 53394182        32ebb06 vmlinux.memtag			(+259889)
> 29230746        18887662        5275652 53394060        32eba8c vmlinux.memtag-default-on	(+259767) dropped?
> 29276214        18946374        5177348 53399936        32ed180 vmlinux.memtag-debug		(+265643)
> 
> Just adding the patches increases the size by 5k. But the rest shows an
> increase of 259k, and you are worried about 4k (and possibly less?)???

Most of that is data (505024), not text (68582, or 66k).

The data is mostly the alloc tags themselves (one per allocation
callsite, and you compiled the entire kernel), so that's expected.

Of the text, a lot of that is going to be slowpath stuff - module load
and unload hooks, formatt and printing the output, other assorted bits.

Then there's Allocation and deallocating obj extensions vectors - not
slowpath but not super fast path, not every allocation.

The fastpath instruction count overhead is pretty small
 - actually doing the accounting - the core of slub.c, page_alloc.c,
   percpu.c
 - setting/restoring the alloc tag: this is overhead we add to every
   allocation callsite, so it's the most relevant - but it's just a few
   instructions.

So that's the breakdown. Definitely not zero overhead, but that fixed
memory overhead (and additionally, the percpu counters) is the price we
pay for very low runtime CPU overhead.
Kent Overstreet Feb. 15, 2024, 11:54 p.m. UTC | #23
On Thu, Feb 15, 2024 at 03:19:33PM -0800, Dave Hansen wrote:
> On 2/15/24 15:07, Steven Rostedt wrote:
> > Just adding the patches increases the size by 5k. But the rest shows an
> > increase of 259k, and you are worried about 4k (and possibly less?)???
> 
> Doesn't the new page_ext thingy add a pointer per 'struct page', or
> ~0.2% of RAM, or ~32MB on a 16GB laptop?  I, too, am confused why 4k is
> even remotely an issue.

page_ext adds a separate per-page array; it itself does not add a
pointer to strugt page, it's an array lookup that uses the page pfn.

We do add a pointer to page_ext, and that's (for now) unavoidable
overhead - but we'll be looking at referencing code tags by index, and
if that works out we'll be able to kill the page_ext dependency and just
store the alloc tag index in page bits.
Kent Overstreet Feb. 15, 2024, 11:56 p.m. UTC | #24
On Thu, Feb 15, 2024 at 06:27:29PM -0500, Steven Rostedt wrote:
> All this, and we are still worried about 4k for useful debugging :-/

Every additional 4k still needs justification. And whether we burn a
reserve on this will have no observable effect on user output in
remotely normal situations; if this allocation ever fails, we've already
been in an OOM situation for awhile and we've already printed out this
report many times, with less memory pressure where the allocation would
have succeeded.
Steven Rostedt Feb. 16, 2024, 12:21 a.m. UTC | #25
On Thu, 15 Feb 2024 18:51:41 -0500
Kent Overstreet <kent.overstreet@linux.dev> wrote:

> Most of that is data (505024), not text (68582, or 66k).
> 

And the 4K extra would have been data too.

> The data is mostly the alloc tags themselves (one per allocation
> callsite, and you compiled the entire kernel), so that's expected.
> 
> Of the text, a lot of that is going to be slowpath stuff - module load
> and unload hooks, formatt and printing the output, other assorted bits.
> 
> Then there's Allocation and deallocating obj extensions vectors - not
> slowpath but not super fast path, not every allocation.
> 
> The fastpath instruction count overhead is pretty small
>  - actually doing the accounting - the core of slub.c, page_alloc.c,
>    percpu.c
>  - setting/restoring the alloc tag: this is overhead we add to every
>    allocation callsite, so it's the most relevant - but it's just a few
>    instructions.
> 
> So that's the breakdown. Definitely not zero overhead, but that fixed
> memory overhead (and additionally, the percpu counters) is the price we
> pay for very low runtime CPU overhead.

But where are the benchmarks that are not micro-benchmarks. How much
overhead does this cause to those? Is it in the noise, or is it noticeable?

-- Steve
Kent Overstreet Feb. 16, 2024, 12:32 a.m. UTC | #26
On Thu, Feb 15, 2024 at 07:21:41PM -0500, Steven Rostedt wrote:
> On Thu, 15 Feb 2024 18:51:41 -0500
> Kent Overstreet <kent.overstreet@linux.dev> wrote:
> 
> > Most of that is data (505024), not text (68582, or 66k).
> > 
> 
> And the 4K extra would have been data too.

"It's not that much" isn't an argument for being wasteful.

> > The data is mostly the alloc tags themselves (one per allocation
> > callsite, and you compiled the entire kernel), so that's expected.
> > 
> > Of the text, a lot of that is going to be slowpath stuff - module load
> > and unload hooks, formatt and printing the output, other assorted bits.
> > 
> > Then there's Allocation and deallocating obj extensions vectors - not
> > slowpath but not super fast path, not every allocation.
> > 
> > The fastpath instruction count overhead is pretty small
> >  - actually doing the accounting - the core of slub.c, page_alloc.c,
> >    percpu.c
> >  - setting/restoring the alloc tag: this is overhead we add to every
> >    allocation callsite, so it's the most relevant - but it's just a few
> >    instructions.
> > 
> > So that's the breakdown. Definitely not zero overhead, but that fixed
> > memory overhead (and additionally, the percpu counters) is the price we
> > pay for very low runtime CPU overhead.
> 
> But where are the benchmarks that are not micro-benchmarks. How much
> overhead does this cause to those? Is it in the noise, or is it noticeable?

Microbenchmarks are how we magnify the effect of a change like this to
the most we'll ever see. Barring cache effects, it'll be in the noise.

Cache effects are a concern here because we're now touching task_struct
in the allocation fast path; that is where the
"compiled-in-but-turned-off" overhead comes from, because we can't add
static keys for that code without doubling the amount of icache
footprint, and I don't think that would be a great tradeoff.

So: if your code has fastpath allocations where the hot part of
task_struct isn't in cache, then this will be noticeable overhead to
you, otherwise it won't be.
Steven Rostedt Feb. 16, 2024, 12:39 a.m. UTC | #27
On Thu, 15 Feb 2024 19:32:38 -0500
Kent Overstreet <kent.overstreet@linux.dev> wrote:

> > But where are the benchmarks that are not micro-benchmarks. How much
> > overhead does this cause to those? Is it in the noise, or is it noticeable?  
> 
> Microbenchmarks are how we magnify the effect of a change like this to
> the most we'll ever see. Barring cache effects, it'll be in the noise.
> 
> Cache effects are a concern here because we're now touching task_struct
> in the allocation fast path; that is where the
> "compiled-in-but-turned-off" overhead comes from, because we can't add
> static keys for that code without doubling the amount of icache
> footprint, and I don't think that would be a great tradeoff.
> 
> So: if your code has fastpath allocations where the hot part of
> task_struct isn't in cache, then this will be noticeable overhead to
> you, otherwise it won't be.

All nice, but where are the benchmarks? This looks like it will have an
affect on cache and you can talk all you want about how it will not be an
issue, but without real world benchmarks, it's meaningless. Numbers talk.

-- Steve
Kent Overstreet Feb. 16, 2024, 12:50 a.m. UTC | #28
On Thu, Feb 15, 2024 at 07:39:15PM -0500, Steven Rostedt wrote:
> On Thu, 15 Feb 2024 19:32:38 -0500
> Kent Overstreet <kent.overstreet@linux.dev> wrote:
> 
> > > But where are the benchmarks that are not micro-benchmarks. How much
> > > overhead does this cause to those? Is it in the noise, or is it noticeable?  
> > 
> > Microbenchmarks are how we magnify the effect of a change like this to
> > the most we'll ever see. Barring cache effects, it'll be in the noise.
> > 
> > Cache effects are a concern here because we're now touching task_struct
> > in the allocation fast path; that is where the
> > "compiled-in-but-turned-off" overhead comes from, because we can't add
> > static keys for that code without doubling the amount of icache
> > footprint, and I don't think that would be a great tradeoff.
> > 
> > So: if your code has fastpath allocations where the hot part of
> > task_struct isn't in cache, then this will be noticeable overhead to
> > you, otherwise it won't be.
> 
> All nice, but where are the benchmarks? This looks like it will have an
> affect on cache and you can talk all you want about how it will not be an
> issue, but without real world benchmarks, it's meaningless. Numbers talk.

Steve, you're being demanding. We provided sufficient benchmarks to show
the overhead is low enough for production, and then I gave you a
detailed breakdown of where our overhead is and where it'll show up. I
think that's reasonable.
Suren Baghdasaryan Feb. 19, 2024, 5:17 p.m. UTC | #29
On Thu, Feb 15, 2024 at 3:56 PM Kent Overstreet
<kent.overstreet@linux.dev> wrote:
>
> On Thu, Feb 15, 2024 at 06:27:29PM -0500, Steven Rostedt wrote:
> > All this, and we are still worried about 4k for useful debugging :-/

I was planning to refactor this function to print one record at a time
with a smaller buffer but after discussing with Kent, he has plans to
reuse this function and having the report in one buffer is needed for
that.

> Every additional 4k still needs justification. And whether we burn a
> reserve on this will have no observable effect on user output in
> remotely normal situations; if this allocation ever fails, we've already
> been in an OOM situation for awhile and we've already printed out this
> report many times, with less memory pressure where the allocation would
> have succeeded.

I'm not sure this claim will always be true, specifically in the case
of low-end devices with relatively low amounts of reserves and in the
presence of a possible quick memory usage spike. We should also
consider a case when panic_on_oom is set. All we get is one OOM
report, so we get only one chance to capture this report. In any case,
I don't yet have data to prove or disprove this claim but it will be
interesting to test it with data from the field once the feature is
deployed.

For now I think with Vlastimil's __GFP_NOWARN suggestion the code
becomes safe and the only risk is to lose this report. If we get cases
with reports missing this data, we can easily change to reserved
memory.
Michal Hocko Feb. 20, 2024, 4:23 p.m. UTC | #30
On Mon 19-02-24 09:17:36, Suren Baghdasaryan wrote:
[...]
> For now I think with Vlastimil's __GFP_NOWARN suggestion the code
> becomes safe and the only risk is to lose this report. If we get cases
> with reports missing this data, we can easily change to reserved
> memory.

This is not just about missing part of the oom report. This is annoying
but not earth shattering. Eating into very small reserves (that might be
the only usable memory while the system is struggling in OOM situation)
could cause functional problems that would be non trivial to test for.
All that for debugging purposes is just lame. If you want to reuse the code
for a different purpose then abstract it and allocate the buffer when you
can afford that and use preallocated on when in OOM situation.

We have always went extra mile to avoid potentially disruptive
operations from the oom handling code and I do not see any good reason
to diverge from that principle.
Kent Overstreet Feb. 20, 2024, 5:18 p.m. UTC | #31
On Tue, Feb 20, 2024 at 05:23:29PM +0100, Michal Hocko wrote:
> On Mon 19-02-24 09:17:36, Suren Baghdasaryan wrote:
> [...]
> > For now I think with Vlastimil's __GFP_NOWARN suggestion the code
> > becomes safe and the only risk is to lose this report. If we get cases
> > with reports missing this data, we can easily change to reserved
> > memory.
> 
> This is not just about missing part of the oom report. This is annoying
> but not earth shattering. Eating into very small reserves (that might be
> the only usable memory while the system is struggling in OOM situation)
> could cause functional problems that would be non trivial to test for.
> All that for debugging purposes is just lame. If you want to reuse the code
> for a different purpose then abstract it and allocate the buffer when you
> can afford that and use preallocated on when in OOM situation.
> 
> We have always went extra mile to avoid potentially disruptive
> operations from the oom handling code and I do not see any good reason
> to diverge from that principle.

Michal, I gave you the logic between dedicated reserves and system
reserves. Please stop repeating these vague what-ifs.
Michal Hocko Feb. 20, 2024, 5:24 p.m. UTC | #32
On Tue 20-02-24 12:18:49, Kent Overstreet wrote:
> On Tue, Feb 20, 2024 at 05:23:29PM +0100, Michal Hocko wrote:
> > On Mon 19-02-24 09:17:36, Suren Baghdasaryan wrote:
> > [...]
> > > For now I think with Vlastimil's __GFP_NOWARN suggestion the code
> > > becomes safe and the only risk is to lose this report. If we get cases
> > > with reports missing this data, we can easily change to reserved
> > > memory.
> > 
> > This is not just about missing part of the oom report. This is annoying
> > but not earth shattering. Eating into very small reserves (that might be
> > the only usable memory while the system is struggling in OOM situation)
> > could cause functional problems that would be non trivial to test for.
> > All that for debugging purposes is just lame. If you want to reuse the code
> > for a different purpose then abstract it and allocate the buffer when you
> > can afford that and use preallocated on when in OOM situation.
> > 
> > We have always went extra mile to avoid potentially disruptive
> > operations from the oom handling code and I do not see any good reason
> > to diverge from that principle.
> 
> Michal, I gave you the logic between dedicated reserves and system
> reserves. Please stop repeating these vague what-ifs.

Your argument makes little sense and it seems that it is impossible to
explain that to you. I gave up on discussing this further with you.

Consider NAK to any additional allocation from oom path unless you can
give very _solid_ arguments this is absolutely necessary. "It's gona be
fine and work most of the time" is not a solid argument.
Kent Overstreet Feb. 20, 2024, 5:32 p.m. UTC | #33
On Tue, Feb 20, 2024 at 06:24:41PM +0100, Michal Hocko wrote:
> On Tue 20-02-24 12:18:49, Kent Overstreet wrote:
> > On Tue, Feb 20, 2024 at 05:23:29PM +0100, Michal Hocko wrote:
> > > On Mon 19-02-24 09:17:36, Suren Baghdasaryan wrote:
> > > [...]
> > > > For now I think with Vlastimil's __GFP_NOWARN suggestion the code
> > > > becomes safe and the only risk is to lose this report. If we get cases
> > > > with reports missing this data, we can easily change to reserved
> > > > memory.
> > > 
> > > This is not just about missing part of the oom report. This is annoying
> > > but not earth shattering. Eating into very small reserves (that might be
> > > the only usable memory while the system is struggling in OOM situation)
> > > could cause functional problems that would be non trivial to test for.
> > > All that for debugging purposes is just lame. If you want to reuse the code
> > > for a different purpose then abstract it and allocate the buffer when you
> > > can afford that and use preallocated on when in OOM situation.
> > > 
> > > We have always went extra mile to avoid potentially disruptive
> > > operations from the oom handling code and I do not see any good reason
> > > to diverge from that principle.
> > 
> > Michal, I gave you the logic between dedicated reserves and system
> > reserves. Please stop repeating these vague what-ifs.
> 
> Your argument makes little sense and it seems that it is impossible to
> explain that to you. I gave up on discussing this further with you.

It was your choice to not engage with the technical discussion. And if
you're not going to engage, repeating the same arguments that I already
responded to 10 or 20 emails later is a pretty dishonest way to argue.

You've been doing this kind of grandstanding throughout the entire
discussion across every revision of the patchset.

Knock it off.
Vlastimil Babka Feb. 20, 2024, 6:27 p.m. UTC | #34
On 2/19/24 18:17, Suren Baghdasaryan wrote:
> On Thu, Feb 15, 2024 at 3:56 PM Kent Overstreet
> <kent.overstreet@linux.dev> wrote:
>>
>> On Thu, Feb 15, 2024 at 06:27:29PM -0500, Steven Rostedt wrote:
>> > All this, and we are still worried about 4k for useful debugging :-/
> 
> I was planning to refactor this function to print one record at a time
> with a smaller buffer but after discussing with Kent, he has plans to
> reuse this function and having the report in one buffer is needed for
> that.

We are printing to console, AFAICS all the code involved uses plain printk()
I think it would be way easier to have a function using printk() for this
use case than the seq_buf which is more suitable for /proc and friends. Then
all concerns about buffers would be gone. It wouldn't be that much of a code
duplication?

>> Every additional 4k still needs justification. And whether we burn a
>> reserve on this will have no observable effect on user output in
>> remotely normal situations; if this allocation ever fails, we've already
>> been in an OOM situation for awhile and we've already printed out this
>> report many times, with less memory pressure where the allocation would
>> have succeeded.
> 
> I'm not sure this claim will always be true, specifically in the case
> of low-end devices with relatively low amounts of reserves and in the

That's right, GFP_ATOMIC failures can easily happen without prior OOMs.
Consider a system where userspace allocations fill the memory as they
usually do, up to high watermark. Then a burst of packets is received and
handled by GFP_ATOMIC allocations that deplete the reserves and can't cause
OOMs (OOM is when we fail to reclaim anything, but we are allocating from a
context that can't reclaim), so the very first report would be an GFP_ATOMIC
failure and now it can't allocate that buffer for printing.

I'm sure more such scenarios exist, Cc: Tetsuo who I recall was an expert on
this topic.

> presence of a possible quick memory usage spike. We should also
> consider a case when panic_on_oom is set. All we get is one OOM
> report, so we get only one chance to capture this report. In any case,
> I don't yet have data to prove or disprove this claim but it will be
> interesting to test it with data from the field once the feature is
> deployed.
> 
> For now I think with Vlastimil's __GFP_NOWARN suggestion the code
> becomes safe and the only risk is to lose this report. If we get cases
> with reports missing this data, we can easily change to reserved
> memory.
Suren Baghdasaryan Feb. 20, 2024, 8:59 p.m. UTC | #35
On Tue, Feb 20, 2024 at 10:27 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 2/19/24 18:17, Suren Baghdasaryan wrote:
> > On Thu, Feb 15, 2024 at 3:56 PM Kent Overstreet
> > <kent.overstreet@linux.dev> wrote:
> >>
> >> On Thu, Feb 15, 2024 at 06:27:29PM -0500, Steven Rostedt wrote:
> >> > All this, and we are still worried about 4k for useful debugging :-/
> >
> > I was planning to refactor this function to print one record at a time
> > with a smaller buffer but after discussing with Kent, he has plans to
> > reuse this function and having the report in one buffer is needed for
> > that.
>
> We are printing to console, AFAICS all the code involved uses plain printk()
> I think it would be way easier to have a function using printk() for this
> use case than the seq_buf which is more suitable for /proc and friends. Then
> all concerns about buffers would be gone. It wouldn't be that much of a code
> duplication?

Ok, after discussing this with Kent, I'll change this patch to provide
a function returning N top consumers (the array and N will be provided
by the caller) and then we can print one record at a time with much
less memory needed. That should address reusability concerns, will use
memory more efficiently and will allow for more flexibility (more/less
than 10 records if needed).
Thanks for the feedback, everyone!

>
> >> Every additional 4k still needs justification. And whether we burn a
> >> reserve on this will have no observable effect on user output in
> >> remotely normal situations; if this allocation ever fails, we've already
> >> been in an OOM situation for awhile and we've already printed out this
> >> report many times, with less memory pressure where the allocation would
> >> have succeeded.
> >
> > I'm not sure this claim will always be true, specifically in the case
> > of low-end devices with relatively low amounts of reserves and in the
>
> That's right, GFP_ATOMIC failures can easily happen without prior OOMs.
> Consider a system where userspace allocations fill the memory as they
> usually do, up to high watermark. Then a burst of packets is received and
> handled by GFP_ATOMIC allocations that deplete the reserves and can't cause
> OOMs (OOM is when we fail to reclaim anything, but we are allocating from a
> context that can't reclaim), so the very first report would be an GFP_ATOMIC
> failure and now it can't allocate that buffer for printing.
>
> I'm sure more such scenarios exist, Cc: Tetsuo who I recall was an expert on
> this topic.
>
> > presence of a possible quick memory usage spike. We should also
> > consider a case when panic_on_oom is set. All we get is one OOM
> > report, so we get only one chance to capture this report. In any case,
> > I don't yet have data to prove or disprove this claim but it will be
> > interesting to test it with data from the field once the feature is
> > deployed.
> >
> > For now I think with Vlastimil's __GFP_NOWARN suggestion the code
> > becomes safe and the only risk is to lose this report. If we get cases
> > with reports missing this data, we can easily change to reserved
> > memory.
>
Tetsuo Handa Feb. 21, 2024, 1:21 p.m. UTC | #36
On 2024/02/21 3:27, Vlastimil Babka wrote:
> I'm sure more such scenarios exist, Cc: Tetsuo who I recall was an expert on
> this topic.

"[PATCH v3 10/35] lib: code tagging framework" says that codetag_lock_module_list()
calls down_read() (i.e. sleeping operation), and
"[PATCH v3 31/35] lib: add memory allocations report in show_mem()" says that
__show_mem() calls alloc_tags_show_mem_report() after kmalloc(GFP_ATOMIC) (i.e.
non-sleeping operation) but alloc_tags_show_mem_report() calls down_read() via
codetag_lock_module_list() !?

If __show_mem() might be called from atomic context (e.g. kmalloc(GFP_ATOMIC)),
this will be a sleep in atomic bug.
If __show_mem() might be called while semaphore is held for write,
this will be a read-lock after write-lock deadlock bug.

Not the matter of whether to allocate buffer statically or dynamically.
Please don't hold a lock when trying to report memory usage.
Suren Baghdasaryan Feb. 21, 2024, 6:26 p.m. UTC | #37
On Wed, Feb 21, 2024 at 5:22 AM Tetsuo Handa
<penguin-kernel@i-love.sakura.ne.jp> wrote:
>
> On 2024/02/21 3:27, Vlastimil Babka wrote:
> > I'm sure more such scenarios exist, Cc: Tetsuo who I recall was an expert on
> > this topic.
>
> "[PATCH v3 10/35] lib: code tagging framework" says that codetag_lock_module_list()
> calls down_read() (i.e. sleeping operation), and
> "[PATCH v3 31/35] lib: add memory allocations report in show_mem()" says that
> __show_mem() calls alloc_tags_show_mem_report() after kmalloc(GFP_ATOMIC) (i.e.
> non-sleeping operation) but alloc_tags_show_mem_report() calls down_read() via
> codetag_lock_module_list() !?
>
> If __show_mem() might be called from atomic context (e.g. kmalloc(GFP_ATOMIC)),
> this will be a sleep in atomic bug.
> If __show_mem() might be called while semaphore is held for write,
> this will be a read-lock after write-lock deadlock bug.
>
> Not the matter of whether to allocate buffer statically or dynamically.
> Please don't hold a lock when trying to report memory usage.

Thanks for catching this, Tetsuo! Yes, we take the read-lock here to
ensure that the list of modules is stable. I'm thinking I can replace
the down_read() with down_read_trylock() and if we fail (there is a
race with module load/unload) we will skip generating this report. The
probability of racing with module load/unload while in OOM state I
think is quite low, so skipping this report should not cause much
information loss.

>
>
diff mbox series

Patch

diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index 3fe51e67e231..0a5973c4ad77 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -30,6 +30,8 @@  struct alloc_tag {
 
 #ifdef CONFIG_MEM_ALLOC_PROFILING
 
+void alloc_tags_show_mem_report(struct seq_buf *s);
+
 static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct)
 {
 	return container_of(ct, struct alloc_tag, ct);
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index 2d5226d9262d..54312c213860 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -96,6 +96,44 @@  static const struct seq_operations allocinfo_seq_op = {
 	.show	= allocinfo_show,
 };
 
+void alloc_tags_show_mem_report(struct seq_buf *s)
+{
+	struct codetag_iterator iter;
+	struct codetag *ct;
+	struct {
+		struct codetag		*tag;
+		size_t			bytes;
+	} tags[10], n;
+	unsigned int i, nr = 0;
+
+	codetag_lock_module_list(alloc_tag_cttype, true);
+	iter = codetag_get_ct_iter(alloc_tag_cttype);
+	while ((ct = codetag_next_ct(&iter))) {
+		struct alloc_tag_counters counter = alloc_tag_read(ct_to_alloc_tag(ct));
+
+		n.tag	= ct;
+		n.bytes = counter.bytes;
+
+		for (i = 0; i < nr; i++)
+			if (n.bytes > tags[i].bytes)
+				break;
+
+		if (i < ARRAY_SIZE(tags)) {
+			nr -= nr == ARRAY_SIZE(tags);
+			memmove(&tags[i + 1],
+				&tags[i],
+				sizeof(tags[0]) * (nr - i));
+			nr++;
+			tags[i] = n;
+		}
+	}
+
+	for (i = 0; i < nr; i++)
+		alloc_tag_to_text(s, tags[i].tag);
+
+	codetag_lock_module_list(alloc_tag_cttype, false);
+}
+
 static void __init procfs_init(void)
 {
 	proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op);
diff --git a/mm/show_mem.c b/mm/show_mem.c
index 8dcfafbd283c..d514c15ca076 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -12,6 +12,7 @@ 
 #include <linux/hugetlb.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
+#include <linux/seq_buf.h>
 #include <linux/swap.h>
 #include <linux/vmstat.h>
 
@@ -423,4 +424,18 @@  void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
 #ifdef CONFIG_MEMORY_FAILURE
 	printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages));
 #endif
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+	{
+		struct seq_buf s;
+		char *buf = kmalloc(4096, GFP_ATOMIC);
+
+		if (buf) {
+			printk("Memory allocations:\n");
+			seq_buf_init(&s, buf, 4096);
+			alloc_tags_show_mem_report(&s);
+			printk("%s", buf);
+			kfree(buf);
+		}
+	}
+#endif
 }