diff mbox series

Add accumulated call counter for memory allocation profiling

Message ID 20240617153250.9079-1-00107082@163.com (mailing list archive)
State New
Headers show
Series Add accumulated call counter for memory allocation profiling | expand

Commit Message

David Wang June 17, 2024, 3:32 p.m. UTC
The accumulated call counter can be used to evaluate the rate
of memory allocation via delta(counters)/delta(time).
This metric can help in analyzing performance behaviour,
e.g. tuning cache size, etc.

Signed-off-by: David Wang <00107082@163.com>
---
 include/linux/alloc_tag.h | 11 +++++++----
 lib/alloc_tag.c           |  7 +++----
 2 files changed, 10 insertions(+), 8 deletions(-)

Comments

Suren Baghdasaryan June 30, 2024, 7:33 p.m. UTC | #1
On Mon, Jun 17, 2024 at 8:33 AM David Wang <00107082@163.com> wrote:
>
> Accumulated call counter can be used to evaluate rate
> of memory allocation via delta(counters)/delta(time).
> This metrics can help analysis performance behaviours,
> e.g. tuning cache size, etc.

Sorry for the delay, David.
IIUC with this counter you can identify the number of allocations ever
made from a specific code location. Could you please clarify the usage
a bit more? Is the goal to see which locations are the most active and
the rate at which allocations are made there? How will that
information be used?
I'm a bit cautious here because each counter will take more space and
use some additional cpu cycles.
Thanks,
Suren.

>
> Signed-off-by: David Wang <00107082@163.com>
> ---
>  include/linux/alloc_tag.h | 11 +++++++----
>  lib/alloc_tag.c           |  7 +++----
>  2 files changed, 10 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
> index abd24016a900..62734244c0b9 100644
> --- a/include/linux/alloc_tag.h
> +++ b/include/linux/alloc_tag.h
> @@ -18,6 +18,7 @@
>  struct alloc_tag_counters {
>         u64 bytes;
>         u64 calls;
> +       u64 accu_calls;
>  };
>
>  /*
> @@ -102,14 +103,15 @@ static inline bool mem_alloc_profiling_enabled(void)
>
>  static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag)
>  {
> -       struct alloc_tag_counters v = { 0, 0 };
> +       struct alloc_tag_counters v = { 0, 0, 0 };
>         struct alloc_tag_counters *counter;
>         int cpu;
>
>         for_each_possible_cpu(cpu) {
> -               counter = per_cpu_ptr(tag->counters, cpu);
> -               v.bytes += counter->bytes;
> -               v.calls += counter->calls;
> +               counter         = per_cpu_ptr(tag->counters, cpu);
> +               v.bytes         += counter->bytes;
> +               v.calls         += counter->calls;
> +               v.accu_calls    += counter->accu_calls;
>         }
>
>         return v;
> @@ -145,6 +147,7 @@ static inline void __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag
>          * counter because when we free each part the counter will be decremented.
>          */
>         this_cpu_inc(tag->counters->calls);
> +       this_cpu_inc(tag->counters->accu_calls);
>  }
>
>  static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag)
> diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
> index 11ed973ac359..c4059362d828 100644
> --- a/lib/alloc_tag.c
> +++ b/lib/alloc_tag.c
> @@ -66,8 +66,8 @@ static void allocinfo_stop(struct seq_file *m, void *arg)
>  static void print_allocinfo_header(struct seq_buf *buf)
>  {
>         /* Output format version, so we can change it. */
> -       seq_buf_printf(buf, "allocinfo - version: 1.0\n");
> -       seq_buf_printf(buf, "#     <size>  <calls> <tag info>\n");
> +       seq_buf_printf(buf, "allocinfo - version: 1.1\n");
> +       seq_buf_printf(buf, "#     <size>  <calls> <tag info> <accumulated calls>\n");
>  }
>
>  static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct)
> @@ -78,8 +78,7 @@ static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct)
>
>         seq_buf_printf(out, "%12lli %8llu ", bytes, counter.calls);
>         codetag_to_text(out, ct);
> -       seq_buf_putc(out, ' ');
> -       seq_buf_putc(out, '\n');
> +       seq_buf_printf(out, " %llu\n", counter.accu_calls);
>  }
>
>  static int allocinfo_show(struct seq_file *m, void *arg)
> --
> 2.39.2
>
Kent Overstreet June 30, 2024, 7:52 p.m. UTC | #2
On Sun, Jun 30, 2024 at 12:33:14PM GMT, Suren Baghdasaryan wrote:
> On Mon, Jun 17, 2024 at 8:33 AM David Wang <00107082@163.com> wrote:
> >
> > Accumulated call counter can be used to evaluate rate
> > of memory allocation via delta(counters)/delta(time).
> > This metrics can help analysis performance behaviours,
> > e.g. tuning cache size, etc.
> 
> Sorry for the delay, David.
> IIUC with this counter you can identify the number of allocations ever
> made from a specific code location. Could you please clarify the usage
> a bit more? Is the goal to see which locations are the most active and
> the rate at which allocations are made there? How will that
> information be used?
> I'm a bit cautious here because each counter will take more space and
> use some additional cpu cycles.
> Thanks,
> Suren.

Maybe behind another kconfig option?
David Wang July 1, 2024, 2:23 a.m. UTC | #3
Hi Suren, 

At 2024-07-01 03:33:14, "Suren Baghdasaryan" <surenb@google.com> wrote:
>On Mon, Jun 17, 2024 at 8:33 AM David Wang <00107082@163.com> wrote:
>>
>> Accumulated call counter can be used to evaluate rate
>> of memory allocation via delta(counters)/delta(time).
>> This metrics can help analysis performance behaviours,
>> e.g. tuning cache size, etc.
>
>Sorry for the delay, David.
>IIUC with this counter you can identify the number of allocations ever
>made from a specific code location. Could you please clarify the usage
>a bit more? Is the goal to see which locations are the most active and
>the rate at which allocations are made there? How will that
>information be used?
 
Cumulative counters can be sampled with timestamp,  say at T1, a monitoring tool got a sample value V1,
then after sampling interval, at T2,  got a sample value V2. Then the average rate of allocation can be evaluated
via (V2-V1)/(T2-T1). (The accuracy depends on sampling interval)

This information "may" help identify where memory allocation is unnecessarily frequent,
and gain some better performance by making fewer memory allocations.
The performance "gain" is just a guess; I do not have a valid example.



>I'm a bit cautious here because each counter will take more space and
>use some additional cpu cycles.
>Thanks,
>Suren.
>



Thanks~!
David
Kent Overstreet July 1, 2024, 9:58 p.m. UTC | #4
On Mon, Jul 01, 2024 at 10:23:32AM GMT, David Wang wrote:
> HI Suren, 
> 
> At 2024-07-01 03:33:14, "Suren Baghdasaryan" <surenb@google.com> wrote:
> >On Mon, Jun 17, 2024 at 8:33 AM David Wang <00107082@163.com> wrote:
> >>
> >> Accumulated call counter can be used to evaluate rate
> >> of memory allocation via delta(counters)/delta(time).
> >> This metrics can help analysis performance behaviours,
> >> e.g. tuning cache size, etc.
> >
> >Sorry for the delay, David.
> >IIUC with this counter you can identify the number of allocations ever
> >made from a specific code location. Could you please clarify the usage
> >a bit more? Is the goal to see which locations are the most active and
> >the rate at which allocations are made there? How will that
> >information be used?
>  
> Cumulative counters can be sampled with timestamp,  say at T1, a monitoring tool got a sample value V1,
> then after sampling interval, at T2,  got a sample value V2. Then the average rate of allocation can be evaluated
> via (V2-V1)/(T2-T1). (The accuracy depends on sampling interval)
> 
> This information "may" help identify where the memory allocation is unnecessary frequent,  
> and  gain some  better performance by making less memory allocation .
> The performance "gain" is just a guess, I do not have a valid example.

Easier to just run perf...
David Wang Sept. 12, 2024, 2:27 a.m. UTC | #5
At 2024-07-02 05:58:50, "Kent Overstreet" <kent.overstreet@linux.dev> wrote:
>On Mon, Jul 01, 2024 at 10:23:32AM GMT, David Wang wrote:
>> HI Suren, 
>> 
>> At 2024-07-01 03:33:14, "Suren Baghdasaryan" <surenb@google.com> wrote:
>> >On Mon, Jun 17, 2024 at 8:33 AM David Wang <00107082@163.com> wrote:
>> >>
>> >> Accumulated call counter can be used to evaluate rate
>> >> of memory allocation via delta(counters)/delta(time).
>> >> This metrics can help analysis performance behaviours,
>> >> e.g. tuning cache size, etc.
>> >
>> >Sorry for the delay, David.
>> >IIUC with this counter you can identify the number of allocations ever
>> >made from a specific code location. Could you please clarify the usage
>> >a bit more? Is the goal to see which locations are the most active and
>> >the rate at which allocations are made there? How will that
>> >information be used?
>>  
>> Cumulative counters can be sampled with timestamp,  say at T1, a monitoring tool got a sample value V1,
>> then after sampling interval, at T2,  got a sample value V2. Then the average rate of allocation can be evaluated
>> via (V2-V1)/(T2-T1). (The accuracy depends on sampling interval)
>> 
>> This information "may" help identify where the memory allocation is unnecessary frequent,  
>> and  gain some  better performance by making less memory allocation .
>> The performance "gain" is just a guess, I do not have a valid example.
>
>Easier to just run perf...

Hi, 

To Kent:
It is strangely odd to reply to this when I was trying to debug a performance issue for bcachefs :)

Yes, it is true that performance bottlenecks could be identified by perf tools, but normally perf
is not continuously running (well, there are some continuous profiling projects out there).
And also, memory allocation normally is not the biggest bottleneck,
 so its impact may not be easily picked up by perf. 

Well, in the case of https://lore.kernel.org/lkml/20240906154354.61915-1-00107082@163.com/,
the memory allocation is picked up by perf tools though. 
But with this patch, it is easier to spot that memory allocation behavior is quite different:
When performance was bad, the average rate for 
"fs/bcachefs/io_write.c:113 func:__bio_alloc_page_pool" was 400k/s,
while when performance was good, the rate was only less than 200/s.

(I have a sample tool collecting /proc/allocinfo, and the data is stored in prometheus,
the rate is calculated and plot via prometheus statement:
irate(mem_profiling_count_total{file=~"fs/bcachefs.*", func="__bio_alloc_page_pool"}[5m]))

Hope this could be a valid example demonstrating the usefulness of accumulative counters
of memory allocation for performance issues.


Thanks
David
Suren Baghdasaryan Sept. 12, 2024, 4:12 p.m. UTC | #6
On Wed, Sep 11, 2024 at 7:28 PM David Wang <00107082@163.com> wrote:
>
> At 2024-07-02 05:58:50, "Kent Overstreet" <kent.overstreet@linux.dev> wrote:
> >On Mon, Jul 01, 2024 at 10:23:32AM GMT, David Wang wrote:
> >> HI Suren,
> >>
> >> At 2024-07-01 03:33:14, "Suren Baghdasaryan" <surenb@google.com> wrote:
> >> >On Mon, Jun 17, 2024 at 8:33 AM David Wang <00107082@163.com> wrote:
> >> >>
> >> >> Accumulated call counter can be used to evaluate rate
> >> >> of memory allocation via delta(counters)/delta(time).
> >> >> This metrics can help analysis performance behaviours,
> >> >> e.g. tuning cache size, etc.
> >> >
> >> >Sorry for the delay, David.
> >> >IIUC with this counter you can identify the number of allocations ever
> >> >made from a specific code location. Could you please clarify the usage
> >> >a bit more? Is the goal to see which locations are the most active and
> >> >the rate at which allocations are made there? How will that
> >> >information be used?
> >>
> >> Cumulative counters can be sampled with timestamp,  say at T1, a monitoring tool got a sample value V1,
> >> then after sampling interval, at T2,  got a sample value V2. Then the average rate of allocation can be evaluated
> >> via (V2-V1)/(T2-T1). (The accuracy depends on sampling interval)
> >>
> >> This information "may" help identify where the memory allocation is unnecessary frequent,
> >> and  gain some  better performance by making less memory allocation .
> >> The performance "gain" is just a guess, I do not have a valid example.
> >
> >Easier to just run perf...
>
> Hi,
>
> To Kent:
> It is strangely odd to reply to this when I was trying to debug a performance issue for bcachefs :)
>
> Yes it is true that performance bottleneck could be identified by perf tools, but normally perf
> is not continously running (well, there are some continous profiling projects out there).
> And also, memory allocation normally is not the biggest bottleneck,
>  its impact may not easily picked up by perf.
>
> Well, in the case of https://lore.kernel.org/lkml/20240906154354.61915-1-00107082@163.com/,
> the memory allocation is picked up by perf tools though.
> But with this patch, it is easier to spot that memory allocations behavior are quite different:
> When performance were bad, the average rate for
> "fs/bcachefs/io_write.c:113 func:__bio_alloc_page_pool" was 400k/s,
> while when performance were good, rate was only less than 200/s.
>
> (I have a sample tool collecting /proc/allocinfo, and the data is stored in prometheus,
> the rate is calculated and plot via prometheus statement:
> irate(mem_profiling_count_total{file=~"fs/bcachefs.*", func="__bio_alloc_page_pool"}[5m]))
>
> Hope this could be a valid example demonstrating the usefulness of accumulative counters
> of memory allocation for performance issues.

Hi David,
I agree with Kent that this feature should be behind a kconfig flag.
We don't want to impose the overhead to the users who do not need this
feature.
Thanks,
Suren.

>
>
> Thanks
> David
>
>
>
diff mbox series

Patch

diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index abd24016a900..62734244c0b9 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -18,6 +18,7 @@ 
 struct alloc_tag_counters {
 	u64 bytes;
 	u64 calls;
+	u64 accu_calls;
 };
 
 /*
@@ -102,14 +103,15 @@  static inline bool mem_alloc_profiling_enabled(void)
 
 static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag)
 {
-	struct alloc_tag_counters v = { 0, 0 };
+	struct alloc_tag_counters v = { 0, 0, 0 };
 	struct alloc_tag_counters *counter;
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
-		counter = per_cpu_ptr(tag->counters, cpu);
-		v.bytes += counter->bytes;
-		v.calls += counter->calls;
+		counter		= per_cpu_ptr(tag->counters, cpu);
+		v.bytes		+= counter->bytes;
+		v.calls		+= counter->calls;
+		v.accu_calls	+= counter->accu_calls;
 	}
 
 	return v;
@@ -145,6 +147,7 @@  static inline void __alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag
 	 * counter because when we free each part the counter will be decremented.
 	 */
 	this_cpu_inc(tag->counters->calls);
+	this_cpu_inc(tag->counters->accu_calls);
 }
 
 static inline void alloc_tag_ref_set(union codetag_ref *ref, struct alloc_tag *tag)
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index 11ed973ac359..c4059362d828 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -66,8 +66,8 @@  static void allocinfo_stop(struct seq_file *m, void *arg)
 static void print_allocinfo_header(struct seq_buf *buf)
 {
 	/* Output format version, so we can change it. */
-	seq_buf_printf(buf, "allocinfo - version: 1.0\n");
-	seq_buf_printf(buf, "#     <size>  <calls> <tag info>\n");
+	seq_buf_printf(buf, "allocinfo - version: 1.1\n");
+	seq_buf_printf(buf, "#     <size>  <calls> <tag info> <accumulated calls>\n");
 }
 
 static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct)
@@ -78,8 +78,7 @@  static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct)
 
 	seq_buf_printf(out, "%12lli %8llu ", bytes, counter.calls);
 	codetag_to_text(out, ct);
-	seq_buf_putc(out, ' ');
-	seq_buf_putc(out, '\n');
+	seq_buf_printf(out, " %llu\n", counter.accu_calls);
 }
 
 static int allocinfo_show(struct seq_file *m, void *arg)