diff mbox series

[v3,13/35] lib: add allocation tagging support for memory allocation profiling

Message ID 20240212213922.783301-14-surenb@google.com (mailing list archive)
State New
Headers show
Series Memory allocation profiling | expand

Commit Message

Suren Baghdasaryan Feb. 12, 2024, 9:38 p.m. UTC
Introduce CONFIG_MEM_ALLOC_PROFILING which provides definitions to easily
instrument memory allocators. It registers an "alloc_tags" codetag type
with /proc/allocinfo interface to output allocation tag information when
the feature is enabled.
CONFIG_MEM_ALLOC_PROFILING_DEBUG is provided for debugging the memory
allocation profiling instrumentation.
Memory allocation profiling can be enabled or disabled at runtime using
/proc/sys/vm/mem_profiling sysctl when CONFIG_MEM_ALLOC_PROFILING_DEBUG=n.
CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT enables memory allocation
profiling by default.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Co-developed-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 Documentation/admin-guide/sysctl/vm.rst |  16 +++
 Documentation/filesystems/proc.rst      |  28 +++++
 include/asm-generic/codetag.lds.h       |  14 +++
 include/asm-generic/vmlinux.lds.h       |   3 +
 include/linux/alloc_tag.h               | 133 ++++++++++++++++++++
 include/linux/sched.h                   |  24 ++++
 lib/Kconfig.debug                       |  25 ++++
 lib/Makefile                            |   2 +
 lib/alloc_tag.c                         | 158 ++++++++++++++++++++++++
 scripts/module.lds.S                    |   7 ++
 10 files changed, 410 insertions(+)
 create mode 100644 include/asm-generic/codetag.lds.h
 create mode 100644 include/linux/alloc_tag.h
 create mode 100644 lib/alloc_tag.c

Comments

Kees Cook Feb. 12, 2024, 10:40 p.m. UTC | #1
On Mon, Feb 12, 2024 at 01:38:59PM -0800, Suren Baghdasaryan wrote:
> Introduce CONFIG_MEM_ALLOC_PROFILING which provides definitions to easily
> instrument memory allocators. It registers an "alloc_tags" codetag type
> with /proc/allocinfo interface to output allocation tag information when

Please don't add anything new to the top-level /proc directory. This
should likely live in /sys.

> the feature is enabled.
> CONFIG_MEM_ALLOC_PROFILING_DEBUG is provided for debugging the memory
> allocation profiling instrumentation.
> Memory allocation profiling can be enabled or disabled at runtime using
> /proc/sys/vm/mem_profiling sysctl when CONFIG_MEM_ALLOC_PROFILING_DEBUG=n.
> CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT enables memory allocation
> profiling by default.
> 
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> Co-developed-by: Kent Overstreet <kent.overstreet@linux.dev>
> Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> ---
>  Documentation/admin-guide/sysctl/vm.rst |  16 +++
>  Documentation/filesystems/proc.rst      |  28 +++++
>  include/asm-generic/codetag.lds.h       |  14 +++
>  include/asm-generic/vmlinux.lds.h       |   3 +
>  include/linux/alloc_tag.h               | 133 ++++++++++++++++++++
>  include/linux/sched.h                   |  24 ++++
>  lib/Kconfig.debug                       |  25 ++++
>  lib/Makefile                            |   2 +
>  lib/alloc_tag.c                         | 158 ++++++++++++++++++++++++
>  scripts/module.lds.S                    |   7 ++
>  10 files changed, 410 insertions(+)
>  create mode 100644 include/asm-generic/codetag.lds.h
>  create mode 100644 include/linux/alloc_tag.h
>  create mode 100644 lib/alloc_tag.c
> 
> diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
> index c59889de122b..a214719492ea 100644
> --- a/Documentation/admin-guide/sysctl/vm.rst
> +++ b/Documentation/admin-guide/sysctl/vm.rst
> @@ -43,6 +43,7 @@ Currently, these files are in /proc/sys/vm:
>  - legacy_va_layout
>  - lowmem_reserve_ratio
>  - max_map_count
> +- mem_profiling         (only if CONFIG_MEM_ALLOC_PROFILING=y)
>  - memory_failure_early_kill
>  - memory_failure_recovery
>  - min_free_kbytes
> @@ -425,6 +426,21 @@ e.g., up to one or two maps per allocation.
>  The default value is 65530.
>  
>  
> +mem_profiling
> +==============
> +
> +Enable memory profiling (when CONFIG_MEM_ALLOC_PROFILING=y)
> +
> +1: Enable memory profiling.
> +
> +0: Disabld memory profiling.
> +
> +Enabling memory profiling introduces a small performance overhead for all
> +memory allocations.
> +
> +The default value depends on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT.
> +
> +
>  memory_failure_early_kill:
>  ==========================
>  
> diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
> index 104c6d047d9b..40d6d18308e4 100644
> --- a/Documentation/filesystems/proc.rst
> +++ b/Documentation/filesystems/proc.rst
> @@ -688,6 +688,7 @@ files are there, and which are missing.
>   ============ ===============================================================
>   File         Content
>   ============ ===============================================================
> + allocinfo    Memory allocations profiling information
>   apm          Advanced power management info
>   bootconfig   Kernel command line obtained from boot config,
>   	      and, if there were kernel parameters from the
> @@ -953,6 +954,33 @@ also be allocatable although a lot of filesystem metadata may have to be
>  reclaimed to achieve this.
>  
>  
> +allocinfo
> +~~~~~~~
> +
> +Provides information about memory allocations at all locations in the code
> +base. Each allocation in the code is identified by its source file, line
> +number, module and the function calling the allocation. The number of bytes
> +allocated at each location is reported.
> +
> +Example output.
> +
> +::
> +
> +    > cat /proc/allocinfo
> +
> +      153MiB     mm/slub.c:1826 module:slub func:alloc_slab_page
> +     6.08MiB     mm/slab_common.c:950 module:slab_common func:_kmalloc_order
> +     5.09MiB     mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts
> +     4.54MiB     mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact
> +     1.32MiB     include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one
> +     1.16MiB     fs/xfs/xfs_log_priv.h:700 module:xfs func:xlog_kvmalloc
> +     1.00MiB     mm/swap_cgroup.c:48 module:swap_cgroup func:swap_cgroup_prepare
> +      734KiB     fs/xfs/kmem.c:20 module:xfs func:kmem_alloc
> +      640KiB     kernel/rcu/tree.c:3184 module:tree func:fill_page_cache_func
> +      640KiB     drivers/char/virtio_console.c:452 module:virtio_console func:alloc_buf
> +      ...
> +
> +
>  meminfo
>  ~~~~~~~
>  
> diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h
> new file mode 100644
> index 000000000000..64f536b80380
> --- /dev/null
> +++ b/include/asm-generic/codetag.lds.h
> @@ -0,0 +1,14 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +#ifndef __ASM_GENERIC_CODETAG_LDS_H
> +#define __ASM_GENERIC_CODETAG_LDS_H
> +
> +#define SECTION_WITH_BOUNDARIES(_name)	\
> +	. = ALIGN(8);			\
> +	__start_##_name = .;		\
> +	KEEP(*(_name))			\
> +	__stop_##_name = .;
> +
> +#define CODETAG_SECTIONS()		\
> +	SECTION_WITH_BOUNDARIES(alloc_tags)
> +
> +#endif /* __ASM_GENERIC_CODETAG_LDS_H */
> diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
> index 5dd3a61d673d..c9997dc50c50 100644
> --- a/include/asm-generic/vmlinux.lds.h
> +++ b/include/asm-generic/vmlinux.lds.h
> @@ -50,6 +50,8 @@
>   *               [__nosave_begin, __nosave_end] for the nosave data
>   */
>  
> +#include <asm-generic/codetag.lds.h>
> +
>  #ifndef LOAD_OFFSET
>  #define LOAD_OFFSET 0
>  #endif
> @@ -366,6 +368,7 @@
>  	. = ALIGN(8);							\
>  	BOUNDED_SECTION_BY(__dyndbg_classes, ___dyndbg_classes)		\
>  	BOUNDED_SECTION_BY(__dyndbg, ___dyndbg)				\
> +	CODETAG_SECTIONS()						\
>  	LIKELY_PROFILE()		       				\
>  	BRANCH_PROFILE()						\
>  	TRACE_PRINTKS()							\
> diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
> new file mode 100644
> index 000000000000..cf55a149fa84
> --- /dev/null
> +++ b/include/linux/alloc_tag.h
> @@ -0,0 +1,133 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * allocation tagging
> + */
> +#ifndef _LINUX_ALLOC_TAG_H
> +#define _LINUX_ALLOC_TAG_H
> +
> +#include <linux/bug.h>
> +#include <linux/codetag.h>
> +#include <linux/container_of.h>
> +#include <linux/preempt.h>
> +#include <asm/percpu.h>
> +#include <linux/cpumask.h>
> +#include <linux/static_key.h>
> +
> +struct alloc_tag_counters {
> +	u64 bytes;
> +	u64 calls;
> +};
> +
> +/*
> + * An instance of this structure is created in a special ELF section at every
> + * allocation callsite. At runtime, the special section is treated as
> + * an array of these. Embedded codetag utilizes codetag framework.
> + */
> +struct alloc_tag {
> +	struct codetag			ct;
> +	struct alloc_tag_counters __percpu	*counters;
> +} __aligned(8);
> +
> +#ifdef CONFIG_MEM_ALLOC_PROFILING
> +
> +static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct)
> +{
> +	return container_of(ct, struct alloc_tag, ct);
> +}
> +
> +#ifdef ARCH_NEEDS_WEAK_PER_CPU
> +/*
> + * When percpu variables are required to be defined as weak, static percpu
> + * variables can't be used inside a function (see comments for DECLARE_PER_CPU_SECTION).
> + */
> +#error "Memory allocation profiling is incompatible with ARCH_NEEDS_WEAK_PER_CPU"

Is this enforced via Kconfig as well? (Looks like only alpha and s390?)

> +#endif
> +
> +#define DEFINE_ALLOC_TAG(_alloc_tag, _old)					\
> +	static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr);	\
> +	static struct alloc_tag _alloc_tag __used __aligned(8)			\
> +	__section("alloc_tags") = {						\
> +		.ct = CODE_TAG_INIT,						\
> +		.counters = &_alloc_tag_cntr };					\
> +	struct alloc_tag * __maybe_unused _old = alloc_tag_save(&_alloc_tag)
> +
> +DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
> +			mem_alloc_profiling_key);
> +
> +static inline bool mem_alloc_profiling_enabled(void)
> +{
> +	return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
> +				   &mem_alloc_profiling_key);
> +}
> +
> +static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag)
> +{
> +	struct alloc_tag_counters v = { 0, 0 };
> +	struct alloc_tag_counters *counter;
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		counter = per_cpu_ptr(tag->counters, cpu);
> +		v.bytes += counter->bytes;
> +		v.calls += counter->calls;
> +	}
> +
> +	return v;
> +}
> +
> +static inline void __alloc_tag_sub(union codetag_ref *ref, size_t bytes)
> +{
> +	struct alloc_tag *tag;
> +
> +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> +	WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
> +#endif
> +	if (!ref || !ref->ct)
> +		return;
> +
> +	tag = ct_to_alloc_tag(ref->ct);
> +
> +	this_cpu_sub(tag->counters->bytes, bytes);
> +	this_cpu_dec(tag->counters->calls);
> +
> +	ref->ct = NULL;
> +}
> +
> +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
> +{
> +	__alloc_tag_sub(ref, bytes);
> +}
> +
> +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes)
> +{
> +	__alloc_tag_sub(ref, bytes);
> +}
> +
> +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes)
> +{
> +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> +	WARN_ONCE(ref && ref->ct,
> +		  "alloc_tag was not cleared (got tag for %s:%u)\n",\
> +		  ref->ct->filename, ref->ct->lineno);
> +
> +	WARN_ONCE(!tag, "current->alloc_tag not set");
> +#endif
> +	if (!ref || !tag)
> +		return;
> +
> +	ref->ct = &tag->ct;
> +	this_cpu_add(tag->counters->bytes, bytes);
> +	this_cpu_inc(tag->counters->calls);
> +}
> +
> +#else
> +
> +#define DEFINE_ALLOC_TAG(_alloc_tag, _old)
> +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {}
> +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) {}
> +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag,
> +				 size_t bytes) {}
> +
> +#endif
> +
> +#endif /* _LINUX_ALLOC_TAG_H */
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index ffe8f618ab86..da68a10517c8 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -770,6 +770,10 @@ struct task_struct {
>  	unsigned int			flags;
>  	unsigned int			ptrace;
>  
> +#ifdef CONFIG_MEM_ALLOC_PROFILING
> +	struct alloc_tag		*alloc_tag;
> +#endif

Normally scheduling is very sensitive to having anything early in
task_struct. I would suggest moving this the CONFIG_SCHED_CORE ifdef
area.

> +
>  #ifdef CONFIG_SMP
>  	int				on_cpu;
>  	struct __call_single_node	wake_entry;
> @@ -810,6 +814,7 @@ struct task_struct {
>  	struct task_group		*sched_task_group;
>  #endif
>  
> +
>  #ifdef CONFIG_UCLAMP_TASK
>  	/*
>  	 * Clamp values requested for a scheduling entity.
> @@ -2183,4 +2188,23 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); }
>  
>  extern void sched_set_stop_task(int cpu, struct task_struct *stop);
>  
> +#ifdef CONFIG_MEM_ALLOC_PROFILING
> +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag)
> +{
> +	swap(current->alloc_tag, tag);
> +	return tag;
> +}
> +
> +static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old)
> +{
> +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> +	WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n");
> +#endif
> +	current->alloc_tag = old;
> +}
> +#else
> +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { return NULL; }
> +#define alloc_tag_restore(_tag, _old)
> +#endif
> +
>  #endif
> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index 0be2d00c3696..78d258ca508f 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -972,6 +972,31 @@ config CODE_TAGGING
>  	bool
>  	select KALLSYMS
>  
> +config MEM_ALLOC_PROFILING
> +	bool "Enable memory allocation profiling"
> +	default n
> +	depends on PROC_FS
> +	depends on !DEBUG_FORCE_WEAK_PER_CPU
> +	select CODE_TAGGING
> +	help
> +	  Track allocation source code and record total allocation size
> +	  initiated at that code location. The mechanism can be used to track
> +	  memory leaks with a low performance and memory impact.
> +
> +config MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
> +	bool "Enable memory allocation profiling by default"
> +	default y
> +	depends on MEM_ALLOC_PROFILING
> +
> +config MEM_ALLOC_PROFILING_DEBUG
> +	bool "Memory allocation profiler debugging"
> +	default n
> +	depends on MEM_ALLOC_PROFILING
> +	select MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
> +	help
> +	  Adds warnings with helpful error messages for memory allocation
> +	  profiling.
> +
>  source "lib/Kconfig.kasan"
>  source "lib/Kconfig.kfence"
>  source "lib/Kconfig.kmsan"
> diff --git a/lib/Makefile b/lib/Makefile
> index 6b48b22fdfac..859112f09bf5 100644
> --- a/lib/Makefile
> +++ b/lib/Makefile
> @@ -236,6 +236,8 @@ obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \
>  obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
>  
>  obj-$(CONFIG_CODE_TAGGING) += codetag.o
> +obj-$(CONFIG_MEM_ALLOC_PROFILING) += alloc_tag.o
> +
>  lib-$(CONFIG_GENERIC_BUG) += bug.o
>  
>  obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o
> diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
> new file mode 100644
> index 000000000000..4fc031f9cefd
> --- /dev/null
> +++ b/lib/alloc_tag.c
> @@ -0,0 +1,158 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +#include <linux/alloc_tag.h>
> +#include <linux/fs.h>
> +#include <linux/gfp.h>
> +#include <linux/module.h>
> +#include <linux/proc_fs.h>
> +#include <linux/seq_buf.h>
> +#include <linux/seq_file.h>
> +
> +static struct codetag_type *alloc_tag_cttype;
> +
> +DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
> +			mem_alloc_profiling_key);
> +
> +static void *allocinfo_start(struct seq_file *m, loff_t *pos)
> +{
> +	struct codetag_iterator *iter;
> +	struct codetag *ct;
> +	loff_t node = *pos;
> +
> +	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
> +	m->private = iter;
> +	if (!iter)
> +		return NULL;
> +
> +	codetag_lock_module_list(alloc_tag_cttype, true);
> +	*iter = codetag_get_ct_iter(alloc_tag_cttype);
> +	while ((ct = codetag_next_ct(iter)) != NULL && node)
> +		node--;
> +
> +	return ct ? iter : NULL;
> +}
> +
> +static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos)
> +{
> +	struct codetag_iterator *iter = (struct codetag_iterator *)arg;
> +	struct codetag *ct = codetag_next_ct(iter);
> +
> +	(*pos)++;
> +	if (!ct)
> +		return NULL;
> +
> +	return iter;
> +}
> +
> +static void allocinfo_stop(struct seq_file *m, void *arg)
> +{
> +	struct codetag_iterator *iter = (struct codetag_iterator *)m->private;
> +
> +	if (iter) {
> +		codetag_lock_module_list(alloc_tag_cttype, false);
> +		kfree(iter);
> +	}
> +}
> +
> +static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct)
> +{
> +	struct alloc_tag *tag = ct_to_alloc_tag(ct);
> +	struct alloc_tag_counters counter = alloc_tag_read(tag);
> +	s64 bytes = counter.bytes;
> +	char val[10], *p = val;
> +
> +	if (bytes < 0) {
> +		*p++ = '-';
> +		bytes = -bytes;
> +	}
> +
> +	string_get_size(bytes, 1,
> +			STRING_SIZE_BASE2|STRING_SIZE_NOSPACE,
> +			p, val + ARRAY_SIZE(val) - p);
> +
> +	seq_buf_printf(out, "%8s %8llu ", val, counter.calls);
> +	codetag_to_text(out, ct);
> +	seq_buf_putc(out, ' ');
> +	seq_buf_putc(out, '\n');
> +}

/me does happy seq_buf dance!

> +
> +static int allocinfo_show(struct seq_file *m, void *arg)
> +{
> +	struct codetag_iterator *iter = (struct codetag_iterator *)arg;
> +	char *bufp;
> +	size_t n = seq_get_buf(m, &bufp);
> +	struct seq_buf buf;
> +
> +	seq_buf_init(&buf, bufp, n);
> +	alloc_tag_to_text(&buf, iter->ct);
> +	seq_commit(m, seq_buf_used(&buf));
> +	return 0;
> +}
> +
> +static const struct seq_operations allocinfo_seq_op = {
> +	.start	= allocinfo_start,
> +	.next	= allocinfo_next,
> +	.stop	= allocinfo_stop,
> +	.show	= allocinfo_show,
> +};
> +
> +static void __init procfs_init(void)
> +{
> +	proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op);
> +}

As mentioned, this really should be in /sys somewhere.

> +
> +static bool alloc_tag_module_unload(struct codetag_type *cttype,
> +				    struct codetag_module *cmod)
> +{
> +	struct codetag_iterator iter = codetag_get_ct_iter(cttype);
> +	struct alloc_tag_counters counter;
> +	bool module_unused = true;
> +	struct alloc_tag *tag;
> +	struct codetag *ct;
> +
> +	for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) {
> +		if (iter.cmod != cmod)
> +			continue;
> +
> +		tag = ct_to_alloc_tag(ct);
> +		counter = alloc_tag_read(tag);
> +
> +		if (WARN(counter.bytes, "%s:%u module %s func:%s has %llu allocated at module unload",
> +			  ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes))
> +			module_unused = false;
> +	}
> +
> +	return module_unused;
> +}
> +
> +static struct ctl_table memory_allocation_profiling_sysctls[] = {
> +	{
> +		.procname	= "mem_profiling",
> +		.data		= &mem_alloc_profiling_key,
> +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> +		.mode		= 0444,
> +#else
> +		.mode		= 0644,
> +#endif
> +		.proc_handler	= proc_do_static_key,
> +	},
> +	{ }
> +};
> +
> +static int __init alloc_tag_init(void)
> +{
> +	const struct codetag_type_desc desc = {
> +		.section	= "alloc_tags",
> +		.tag_size	= sizeof(struct alloc_tag),
> +		.module_unload	= alloc_tag_module_unload,
> +	};
> +
> +	alloc_tag_cttype = codetag_register_type(&desc);
> +	if (IS_ERR_OR_NULL(alloc_tag_cttype))
> +		return PTR_ERR(alloc_tag_cttype);
> +
> +	register_sysctl_init("vm", memory_allocation_profiling_sysctls);
> +	procfs_init();
> +
> +	return 0;
> +}
> +module_init(alloc_tag_init);
> diff --git a/scripts/module.lds.S b/scripts/module.lds.S
> index bf5bcf2836d8..45c67a0994f3 100644
> --- a/scripts/module.lds.S
> +++ b/scripts/module.lds.S
> @@ -9,6 +9,8 @@
>  #define DISCARD_EH_FRAME	*(.eh_frame)
>  #endif
>  
> +#include <asm-generic/codetag.lds.h>
> +
>  SECTIONS {
>  	/DISCARD/ : {
>  		*(.discard)
> @@ -47,12 +49,17 @@ SECTIONS {
>  	.data : {
>  		*(.data .data.[0-9a-zA-Z_]*)
>  		*(.data..L*)
> +		CODETAG_SECTIONS()
>  	}
>  
>  	.rodata : {
>  		*(.rodata .rodata.[0-9a-zA-Z_]*)
>  		*(.rodata..L*)
>  	}
> +#else
> +	.data : {
> +		CODETAG_SECTIONS()
> +	}
>  #endif
>  }

Otherwise, looks good.
Suren Baghdasaryan Feb. 13, 2024, 1:01 a.m. UTC | #2
On Mon, Feb 12, 2024 at 2:40 PM Kees Cook <keescook@chromium.org> wrote:
>
> On Mon, Feb 12, 2024 at 01:38:59PM -0800, Suren Baghdasaryan wrote:
> > Introduce CONFIG_MEM_ALLOC_PROFILING which provides definitions to easily
> > instrument memory allocators. It registers an "alloc_tags" codetag type
> > with /proc/allocinfo interface to output allocation tag information when
>
> Please don't add anything new to the top-level /proc directory. This
> should likely live in /sys.

Ack. I'll find a more appropriate place for it then.
It just seemed like such generic information which would belong next
to meminfo/zoneinfo and such...

>
> > the feature is enabled.
> > CONFIG_MEM_ALLOC_PROFILING_DEBUG is provided for debugging the memory
> > allocation profiling instrumentation.
> > Memory allocation profiling can be enabled or disabled at runtime using
> > /proc/sys/vm/mem_profiling sysctl when CONFIG_MEM_ALLOC_PROFILING_DEBUG=n.
> > CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT enables memory allocation
> > profiling by default.
> >
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > Co-developed-by: Kent Overstreet <kent.overstreet@linux.dev>
> > Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> > ---
> >  Documentation/admin-guide/sysctl/vm.rst |  16 +++
> >  Documentation/filesystems/proc.rst      |  28 +++++
> >  include/asm-generic/codetag.lds.h       |  14 +++
> >  include/asm-generic/vmlinux.lds.h       |   3 +
> >  include/linux/alloc_tag.h               | 133 ++++++++++++++++++++
> >  include/linux/sched.h                   |  24 ++++
> >  lib/Kconfig.debug                       |  25 ++++
> >  lib/Makefile                            |   2 +
> >  lib/alloc_tag.c                         | 158 ++++++++++++++++++++++++
> >  scripts/module.lds.S                    |   7 ++
> >  10 files changed, 410 insertions(+)
> >  create mode 100644 include/asm-generic/codetag.lds.h
> >  create mode 100644 include/linux/alloc_tag.h
> >  create mode 100644 lib/alloc_tag.c
> >
> > diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
> > index c59889de122b..a214719492ea 100644
> > --- a/Documentation/admin-guide/sysctl/vm.rst
> > +++ b/Documentation/admin-guide/sysctl/vm.rst
> > @@ -43,6 +43,7 @@ Currently, these files are in /proc/sys/vm:
> >  - legacy_va_layout
> >  - lowmem_reserve_ratio
> >  - max_map_count
> > +- mem_profiling         (only if CONFIG_MEM_ALLOC_PROFILING=y)
> >  - memory_failure_early_kill
> >  - memory_failure_recovery
> >  - min_free_kbytes
> > @@ -425,6 +426,21 @@ e.g., up to one or two maps per allocation.
> >  The default value is 65530.
> >
> >
> > +mem_profiling
> > +==============
> > +
> > +Enable memory profiling (when CONFIG_MEM_ALLOC_PROFILING=y)
> > +
> > +1: Enable memory profiling.
> > +
> > +0: Disabld memory profiling.
> > +
> > +Enabling memory profiling introduces a small performance overhead for all
> > +memory allocations.
> > +
> > +The default value depends on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT.
> > +
> > +
> >  memory_failure_early_kill:
> >  ==========================
> >
> > diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
> > index 104c6d047d9b..40d6d18308e4 100644
> > --- a/Documentation/filesystems/proc.rst
> > +++ b/Documentation/filesystems/proc.rst
> > @@ -688,6 +688,7 @@ files are there, and which are missing.
> >   ============ ===============================================================
> >   File         Content
> >   ============ ===============================================================
> > + allocinfo    Memory allocations profiling information
> >   apm          Advanced power management info
> >   bootconfig   Kernel command line obtained from boot config,
> >             and, if there were kernel parameters from the
> > @@ -953,6 +954,33 @@ also be allocatable although a lot of filesystem metadata may have to be
> >  reclaimed to achieve this.
> >
> >
> > +allocinfo
> > +~~~~~~~
> > +
> > +Provides information about memory allocations at all locations in the code
> > +base. Each allocation in the code is identified by its source file, line
> > +number, module and the function calling the allocation. The number of bytes
> > +allocated at each location is reported.
> > +
> > +Example output.
> > +
> > +::
> > +
> > +    > cat /proc/allocinfo
> > +
> > +      153MiB     mm/slub.c:1826 module:slub func:alloc_slab_page
> > +     6.08MiB     mm/slab_common.c:950 module:slab_common func:_kmalloc_order
> > +     5.09MiB     mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts
> > +     4.54MiB     mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact
> > +     1.32MiB     include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one
> > +     1.16MiB     fs/xfs/xfs_log_priv.h:700 module:xfs func:xlog_kvmalloc
> > +     1.00MiB     mm/swap_cgroup.c:48 module:swap_cgroup func:swap_cgroup_prepare
> > +      734KiB     fs/xfs/kmem.c:20 module:xfs func:kmem_alloc
> > +      640KiB     kernel/rcu/tree.c:3184 module:tree func:fill_page_cache_func
> > +      640KiB     drivers/char/virtio_console.c:452 module:virtio_console func:alloc_buf
> > +      ...
> > +
> > +
> >  meminfo
> >  ~~~~~~~
> >
> > diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h
> > new file mode 100644
> > index 000000000000..64f536b80380
> > --- /dev/null
> > +++ b/include/asm-generic/codetag.lds.h
> > @@ -0,0 +1,14 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +#ifndef __ASM_GENERIC_CODETAG_LDS_H
> > +#define __ASM_GENERIC_CODETAG_LDS_H
> > +
> > +#define SECTION_WITH_BOUNDARIES(_name)       \
> > +     . = ALIGN(8);                   \
> > +     __start_##_name = .;            \
> > +     KEEP(*(_name))                  \
> > +     __stop_##_name = .;
> > +
> > +#define CODETAG_SECTIONS()           \
> > +     SECTION_WITH_BOUNDARIES(alloc_tags)
> > +
> > +#endif /* __ASM_GENERIC_CODETAG_LDS_H */
> > diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
> > index 5dd3a61d673d..c9997dc50c50 100644
> > --- a/include/asm-generic/vmlinux.lds.h
> > +++ b/include/asm-generic/vmlinux.lds.h
> > @@ -50,6 +50,8 @@
> >   *               [__nosave_begin, __nosave_end] for the nosave data
> >   */
> >
> > +#include <asm-generic/codetag.lds.h>
> > +
> >  #ifndef LOAD_OFFSET
> >  #define LOAD_OFFSET 0
> >  #endif
> > @@ -366,6 +368,7 @@
> >       . = ALIGN(8);                                                   \
> >       BOUNDED_SECTION_BY(__dyndbg_classes, ___dyndbg_classes)         \
> >       BOUNDED_SECTION_BY(__dyndbg, ___dyndbg)                         \
> > +     CODETAG_SECTIONS()                                              \
> >       LIKELY_PROFILE()                                                \
> >       BRANCH_PROFILE()                                                \
> >       TRACE_PRINTKS()                                                 \
> > diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
> > new file mode 100644
> > index 000000000000..cf55a149fa84
> > --- /dev/null
> > +++ b/include/linux/alloc_tag.h
> > @@ -0,0 +1,133 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +/*
> > + * allocation tagging
> > + */
> > +#ifndef _LINUX_ALLOC_TAG_H
> > +#define _LINUX_ALLOC_TAG_H
> > +
> > +#include <linux/bug.h>
> > +#include <linux/codetag.h>
> > +#include <linux/container_of.h>
> > +#include <linux/preempt.h>
> > +#include <asm/percpu.h>
> > +#include <linux/cpumask.h>
> > +#include <linux/static_key.h>
> > +
> > +struct alloc_tag_counters {
> > +     u64 bytes;
> > +     u64 calls;
> > +};
> > +
> > +/*
> > + * An instance of this structure is created in a special ELF section at every
> > + * allocation callsite. At runtime, the special section is treated as
> > + * an array of these. Embedded codetag utilizes codetag framework.
> > + */
> > +struct alloc_tag {
> > +     struct codetag                  ct;
> > +     struct alloc_tag_counters __percpu      *counters;
> > +} __aligned(8);
> > +
> > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > +
> > +static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct)
> > +{
> > +     return container_of(ct, struct alloc_tag, ct);
> > +}
> > +
> > +#ifdef ARCH_NEEDS_WEAK_PER_CPU
> > +/*
> > + * When percpu variables are required to be defined as weak, static percpu
> > + * variables can't be used inside a function (see comments for DECLARE_PER_CPU_SECTION).
> > + */
> > +#error "Memory allocation profiling is incompatible with ARCH_NEEDS_WEAK_PER_CPU"
>
> Is this enforced via Kconfig as well? (Looks like only alpha and s390?)

Unfortunately ARCH_NEEDS_WEAK_PER_CPU is not a Kconfig option but
CONFIG_DEBUG_FORCE_WEAK_PER_CPU is, so that one is handled via Kconfig
(see "depends on !DEBUG_FORCE_WEAK_PER_CPU" in this patch). We have to
avoid both cases because of this:
https://elixir.bootlin.com/linux/latest/source/include/linux/percpu-defs.h#L75,
so I'm trying to provide an informative error here.

>
> > +#endif
> > +
> > +#define DEFINE_ALLOC_TAG(_alloc_tag, _old)                                   \
> > +     static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr);      \
> > +     static struct alloc_tag _alloc_tag __used __aligned(8)                  \
> > +     __section("alloc_tags") = {                                             \
> > +             .ct = CODE_TAG_INIT,                                            \
> > +             .counters = &_alloc_tag_cntr };                                 \
> > +     struct alloc_tag * __maybe_unused _old = alloc_tag_save(&_alloc_tag)
> > +
> > +DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
> > +                     mem_alloc_profiling_key);
> > +
> > +static inline bool mem_alloc_profiling_enabled(void)
> > +{
> > +     return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
> > +                                &mem_alloc_profiling_key);
> > +}
> > +
> > +static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag)
> > +{
> > +     struct alloc_tag_counters v = { 0, 0 };
> > +     struct alloc_tag_counters *counter;
> > +     int cpu;
> > +
> > +     for_each_possible_cpu(cpu) {
> > +             counter = per_cpu_ptr(tag->counters, cpu);
> > +             v.bytes += counter->bytes;
> > +             v.calls += counter->calls;
> > +     }
> > +
> > +     return v;
> > +}
> > +
> > +static inline void __alloc_tag_sub(union codetag_ref *ref, size_t bytes)
> > +{
> > +     struct alloc_tag *tag;
> > +
> > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> > +     WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
> > +#endif
> > +     if (!ref || !ref->ct)
> > +             return;
> > +
> > +     tag = ct_to_alloc_tag(ref->ct);
> > +
> > +     this_cpu_sub(tag->counters->bytes, bytes);
> > +     this_cpu_dec(tag->counters->calls);
> > +
> > +     ref->ct = NULL;
> > +}
> > +
> > +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
> > +{
> > +     __alloc_tag_sub(ref, bytes);
> > +}
> > +
> > +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes)
> > +{
> > +     __alloc_tag_sub(ref, bytes);
> > +}
> > +
> > +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes)
> > +{
> > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> > +     WARN_ONCE(ref && ref->ct,
> > +               "alloc_tag was not cleared (got tag for %s:%u)\n",\
> > +               ref->ct->filename, ref->ct->lineno);
> > +
> > +     WARN_ONCE(!tag, "current->alloc_tag not set");
> > +#endif
> > +     if (!ref || !tag)
> > +             return;
> > +
> > +     ref->ct = &tag->ct;
> > +     this_cpu_add(tag->counters->bytes, bytes);
> > +     this_cpu_inc(tag->counters->calls);
> > +}
> > +
> > +#else
> > +
> > +#define DEFINE_ALLOC_TAG(_alloc_tag, _old)
> > +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {}
> > +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) {}
> > +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag,
> > +                              size_t bytes) {}
> > +
> > +#endif
> > +
> > +#endif /* _LINUX_ALLOC_TAG_H */
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index ffe8f618ab86..da68a10517c8 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -770,6 +770,10 @@ struct task_struct {
> >       unsigned int                    flags;
> >       unsigned int                    ptrace;
> >
> > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > +     struct alloc_tag                *alloc_tag;
> > +#endif
>
> Normally scheduling is very sensitive to having anything early in
> task_struct. I would suggest moving this the CONFIG_SCHED_CORE ifdef
> area.

Thanks for the warning! We will look into that.

>
> > +
> >  #ifdef CONFIG_SMP
> >       int                             on_cpu;
> >       struct __call_single_node       wake_entry;
> > @@ -810,6 +814,7 @@ struct task_struct {
> >       struct task_group               *sched_task_group;
> >  #endif
> >
> > +
> >  #ifdef CONFIG_UCLAMP_TASK
> >       /*
> >        * Clamp values requested for a scheduling entity.
> > @@ -2183,4 +2188,23 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); }
> >
> >  extern void sched_set_stop_task(int cpu, struct task_struct *stop);
> >
> > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag)
> > +{
> > +     swap(current->alloc_tag, tag);
> > +     return tag;
> > +}
> > +
> > +static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old)
> > +{
> > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> > +     WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n");
> > +#endif
> > +     current->alloc_tag = old;
> > +}
> > +#else
> > +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { return NULL; }
> > +#define alloc_tag_restore(_tag, _old)
> > +#endif
> > +
> >  #endif
> > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> > index 0be2d00c3696..78d258ca508f 100644
> > --- a/lib/Kconfig.debug
> > +++ b/lib/Kconfig.debug
> > @@ -972,6 +972,31 @@ config CODE_TAGGING
> >       bool
> >       select KALLSYMS
> >
> > +config MEM_ALLOC_PROFILING
> > +     bool "Enable memory allocation profiling"
> > +     default n
> > +     depends on PROC_FS
> > +     depends on !DEBUG_FORCE_WEAK_PER_CPU
> > +     select CODE_TAGGING
> > +     help
> > +       Track allocation source code and record total allocation size
> > +       initiated at that code location. The mechanism can be used to track
> > +       memory leaks with a low performance and memory impact.
> > +
> > +config MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
> > +     bool "Enable memory allocation profiling by default"
> > +     default y
> > +     depends on MEM_ALLOC_PROFILING
> > +
> > +config MEM_ALLOC_PROFILING_DEBUG
> > +     bool "Memory allocation profiler debugging"
> > +     default n
> > +     depends on MEM_ALLOC_PROFILING
> > +     select MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
> > +     help
> > +       Adds warnings with helpful error messages for memory allocation
> > +       profiling.
> > +
> >  source "lib/Kconfig.kasan"
> >  source "lib/Kconfig.kfence"
> >  source "lib/Kconfig.kmsan"
> > diff --git a/lib/Makefile b/lib/Makefile
> > index 6b48b22fdfac..859112f09bf5 100644
> > --- a/lib/Makefile
> > +++ b/lib/Makefile
> > @@ -236,6 +236,8 @@ obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \
> >  obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
> >
> >  obj-$(CONFIG_CODE_TAGGING) += codetag.o
> > +obj-$(CONFIG_MEM_ALLOC_PROFILING) += alloc_tag.o
> > +
> >  lib-$(CONFIG_GENERIC_BUG) += bug.o
> >
> >  obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o
> > diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
> > new file mode 100644
> > index 000000000000..4fc031f9cefd
> > --- /dev/null
> > +++ b/lib/alloc_tag.c
> > @@ -0,0 +1,158 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +#include <linux/alloc_tag.h>
> > +#include <linux/fs.h>
> > +#include <linux/gfp.h>
> > +#include <linux/module.h>
> > +#include <linux/proc_fs.h>
> > +#include <linux/seq_buf.h>
> > +#include <linux/seq_file.h>
> > +
> > +static struct codetag_type *alloc_tag_cttype;
> > +
> > +DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
> > +                     mem_alloc_profiling_key);
> > +
> > +static void *allocinfo_start(struct seq_file *m, loff_t *pos)
> > +{
> > +     struct codetag_iterator *iter;
> > +     struct codetag *ct;
> > +     loff_t node = *pos;
> > +
> > +     iter = kzalloc(sizeof(*iter), GFP_KERNEL);
> > +     m->private = iter;
> > +     if (!iter)
> > +             return NULL;
> > +
> > +     codetag_lock_module_list(alloc_tag_cttype, true);
> > +     *iter = codetag_get_ct_iter(alloc_tag_cttype);
> > +     while ((ct = codetag_next_ct(iter)) != NULL && node)
> > +             node--;
> > +
> > +     return ct ? iter : NULL;
> > +}
> > +
> > +static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos)
> > +{
> > +     struct codetag_iterator *iter = (struct codetag_iterator *)arg;
> > +     struct codetag *ct = codetag_next_ct(iter);
> > +
> > +     (*pos)++;
> > +     if (!ct)
> > +             return NULL;
> > +
> > +     return iter;
> > +}
> > +
> > +static void allocinfo_stop(struct seq_file *m, void *arg)
> > +{
> > +     struct codetag_iterator *iter = (struct codetag_iterator *)m->private;
> > +
> > +     if (iter) {
> > +             codetag_lock_module_list(alloc_tag_cttype, false);
> > +             kfree(iter);
> > +     }
> > +}
> > +
> > +static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct)
> > +{
> > +     struct alloc_tag *tag = ct_to_alloc_tag(ct);
> > +     struct alloc_tag_counters counter = alloc_tag_read(tag);
> > +     s64 bytes = counter.bytes;
> > +     char val[10], *p = val;
> > +
> > +     if (bytes < 0) {
> > +             *p++ = '-';
> > +             bytes = -bytes;
> > +     }
> > +
> > +     string_get_size(bytes, 1,
> > +                     STRING_SIZE_BASE2|STRING_SIZE_NOSPACE,
> > +                     p, val + ARRAY_SIZE(val) - p);
> > +
> > +     seq_buf_printf(out, "%8s %8llu ", val, counter.calls);
> > +     codetag_to_text(out, ct);
> > +     seq_buf_putc(out, ' ');
> > +     seq_buf_putc(out, '\n');
> > +}
>
> /me does happy seq_buf dance!
>
> > +
> > +static int allocinfo_show(struct seq_file *m, void *arg)
> > +{
> > +     struct codetag_iterator *iter = (struct codetag_iterator *)arg;
> > +     char *bufp;
> > +     size_t n = seq_get_buf(m, &bufp);
> > +     struct seq_buf buf;
> > +
> > +     seq_buf_init(&buf, bufp, n);
> > +     alloc_tag_to_text(&buf, iter->ct);
> > +     seq_commit(m, seq_buf_used(&buf));
> > +     return 0;
> > +}
> > +
> > +static const struct seq_operations allocinfo_seq_op = {
> > +     .start  = allocinfo_start,
> > +     .next   = allocinfo_next,
> > +     .stop   = allocinfo_stop,
> > +     .show   = allocinfo_show,
> > +};
> > +
> > +static void __init procfs_init(void)
> > +{
> > +     proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op);
> > +}
>
> As mentioned, this really should be in /sys somewhere.

Ack.

>
> > +
> > +static bool alloc_tag_module_unload(struct codetag_type *cttype,
> > +                                 struct codetag_module *cmod)
> > +{
> > +     struct codetag_iterator iter = codetag_get_ct_iter(cttype);
> > +     struct alloc_tag_counters counter;
> > +     bool module_unused = true;
> > +     struct alloc_tag *tag;
> > +     struct codetag *ct;
> > +
> > +     for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) {
> > +             if (iter.cmod != cmod)
> > +                     continue;
> > +
> > +             tag = ct_to_alloc_tag(ct);
> > +             counter = alloc_tag_read(tag);
> > +
> > +             if (WARN(counter.bytes, "%s:%u module %s func:%s has %llu allocated at module unload",
> > +                       ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes))
> > +                     module_unused = false;
> > +     }
> > +
> > +     return module_unused;
> > +}
> > +
> > +static struct ctl_table memory_allocation_profiling_sysctls[] = {
> > +     {
> > +             .procname       = "mem_profiling",
> > +             .data           = &mem_alloc_profiling_key,
> > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> > +             .mode           = 0444,
> > +#else
> > +             .mode           = 0644,
> > +#endif
> > +             .proc_handler   = proc_do_static_key,
> > +     },
> > +     { }
> > +};
> > +
> > +static int __init alloc_tag_init(void)
> > +{
> > +     const struct codetag_type_desc desc = {
> > +             .section        = "alloc_tags",
> > +             .tag_size       = sizeof(struct alloc_tag),
> > +             .module_unload  = alloc_tag_module_unload,
> > +     };
> > +
> > +     alloc_tag_cttype = codetag_register_type(&desc);
> > +     if (IS_ERR_OR_NULL(alloc_tag_cttype))
> > +             return PTR_ERR(alloc_tag_cttype);
> > +
> > +     register_sysctl_init("vm", memory_allocation_profiling_sysctls);
> > +     procfs_init();
> > +
> > +     return 0;
> > +}
> > +module_init(alloc_tag_init);
> > diff --git a/scripts/module.lds.S b/scripts/module.lds.S
> > index bf5bcf2836d8..45c67a0994f3 100644
> > --- a/scripts/module.lds.S
> > +++ b/scripts/module.lds.S
> > @@ -9,6 +9,8 @@
> >  #define DISCARD_EH_FRAME     *(.eh_frame)
> >  #endif
> >
> > +#include <asm-generic/codetag.lds.h>
> > +
> >  SECTIONS {
> >       /DISCARD/ : {
> >               *(.discard)
> > @@ -47,12 +49,17 @@ SECTIONS {
> >       .data : {
> >               *(.data .data.[0-9a-zA-Z_]*)
> >               *(.data..L*)
> > +             CODETAG_SECTIONS()
> >       }
> >
> >       .rodata : {
> >               *(.rodata .rodata.[0-9a-zA-Z_]*)
> >               *(.rodata..L*)
> >       }
> > +#else
> > +     .data : {
> > +             CODETAG_SECTIONS()
> > +     }
> >  #endif
> >  }
>
> Otherwise, looks good.

Thanks!

>
> --
> Kees Cook
Darrick J. Wong Feb. 13, 2024, 10:28 p.m. UTC | #3
On Mon, Feb 12, 2024 at 05:01:19PM -0800, Suren Baghdasaryan wrote:
> On Mon, Feb 12, 2024 at 2:40 PM Kees Cook <keescook@chromium.org> wrote:
> >
> > On Mon, Feb 12, 2024 at 01:38:59PM -0800, Suren Baghdasaryan wrote:
> > > Introduce CONFIG_MEM_ALLOC_PROFILING which provides definitions to easily
> > > instrument memory allocators. It registers an "alloc_tags" codetag type
> > > with /proc/allocinfo interface to output allocation tag information when
> >
> > Please don't add anything new to the top-level /proc directory. This
> > should likely live in /sys.
> 
> Ack. I'll find a more appropriate place for it then.
> It just seemed like such generic information which would belong next
> to meminfo/zoneinfo and such...

Save yourself a cycle of "rework the whole fs interface only to have
someone else tell you no" and put it in debugfs, not sysfs.  Wrangling
with debugfs is easier than all the macro-happy sysfs stuff; you don't
have to integrate with the "device" model; and there is no 'one value
per file' rule.

--D

> >
> > > the feature is enabled.
> > > CONFIG_MEM_ALLOC_PROFILING_DEBUG is provided for debugging the memory
> > > allocation profiling instrumentation.
> > > Memory allocation profiling can be enabled or disabled at runtime using
> > > /proc/sys/vm/mem_profiling sysctl when CONFIG_MEM_ALLOC_PROFILING_DEBUG=n.
> > > CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT enables memory allocation
> > > profiling by default.
> > >
> > > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > > Co-developed-by: Kent Overstreet <kent.overstreet@linux.dev>
> > > Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> > > ---
> > >  Documentation/admin-guide/sysctl/vm.rst |  16 +++
> > >  Documentation/filesystems/proc.rst      |  28 +++++
> > >  include/asm-generic/codetag.lds.h       |  14 +++
> > >  include/asm-generic/vmlinux.lds.h       |   3 +
> > >  include/linux/alloc_tag.h               | 133 ++++++++++++++++++++
> > >  include/linux/sched.h                   |  24 ++++
> > >  lib/Kconfig.debug                       |  25 ++++
> > >  lib/Makefile                            |   2 +
> > >  lib/alloc_tag.c                         | 158 ++++++++++++++++++++++++
> > >  scripts/module.lds.S                    |   7 ++
> > >  10 files changed, 410 insertions(+)
> > >  create mode 100644 include/asm-generic/codetag.lds.h
> > >  create mode 100644 include/linux/alloc_tag.h
> > >  create mode 100644 lib/alloc_tag.c
> > >
> > > diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
> > > index c59889de122b..a214719492ea 100644
> > > --- a/Documentation/admin-guide/sysctl/vm.rst
> > > +++ b/Documentation/admin-guide/sysctl/vm.rst
> > > @@ -43,6 +43,7 @@ Currently, these files are in /proc/sys/vm:
> > >  - legacy_va_layout
> > >  - lowmem_reserve_ratio
> > >  - max_map_count
> > > +- mem_profiling         (only if CONFIG_MEM_ALLOC_PROFILING=y)
> > >  - memory_failure_early_kill
> > >  - memory_failure_recovery
> > >  - min_free_kbytes
> > > @@ -425,6 +426,21 @@ e.g., up to one or two maps per allocation.
> > >  The default value is 65530.
> > >
> > >
> > > +mem_profiling
> > > +==============
> > > +
> > > +Enable memory profiling (when CONFIG_MEM_ALLOC_PROFILING=y)
> > > +
> > > +1: Enable memory profiling.
> > > +
> > > +0: Disabld memory profiling.
> > > +
> > > +Enabling memory profiling introduces a small performance overhead for all
> > > +memory allocations.
> > > +
> > > +The default value depends on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT.
> > > +
> > > +
> > >  memory_failure_early_kill:
> > >  ==========================
> > >
> > > diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
> > > index 104c6d047d9b..40d6d18308e4 100644
> > > --- a/Documentation/filesystems/proc.rst
> > > +++ b/Documentation/filesystems/proc.rst
> > > @@ -688,6 +688,7 @@ files are there, and which are missing.
> > >   ============ ===============================================================
> > >   File         Content
> > >   ============ ===============================================================
> > > + allocinfo    Memory allocations profiling information
> > >   apm          Advanced power management info
> > >   bootconfig   Kernel command line obtained from boot config,
> > >             and, if there were kernel parameters from the
> > > @@ -953,6 +954,33 @@ also be allocatable although a lot of filesystem metadata may have to be
> > >  reclaimed to achieve this.
> > >
> > >
> > > +allocinfo
> > > +~~~~~~~
> > > +
> > > +Provides information about memory allocations at all locations in the code
> > > +base. Each allocation in the code is identified by its source file, line
> > > +number, module and the function calling the allocation. The number of bytes
> > > +allocated at each location is reported.
> > > +
> > > +Example output.
> > > +
> > > +::
> > > +
> > > +    > cat /proc/allocinfo
> > > +
> > > +      153MiB     mm/slub.c:1826 module:slub func:alloc_slab_page
> > > +     6.08MiB     mm/slab_common.c:950 module:slab_common func:_kmalloc_order
> > > +     5.09MiB     mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts
> > > +     4.54MiB     mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact
> > > +     1.32MiB     include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one
> > > +     1.16MiB     fs/xfs/xfs_log_priv.h:700 module:xfs func:xlog_kvmalloc
> > > +     1.00MiB     mm/swap_cgroup.c:48 module:swap_cgroup func:swap_cgroup_prepare
> > > +      734KiB     fs/xfs/kmem.c:20 module:xfs func:kmem_alloc
> > > +      640KiB     kernel/rcu/tree.c:3184 module:tree func:fill_page_cache_func
> > > +      640KiB     drivers/char/virtio_console.c:452 module:virtio_console func:alloc_buf
> > > +      ...
> > > +
> > > +
> > >  meminfo
> > >  ~~~~~~~
> > >
> > > diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h
> > > new file mode 100644
> > > index 000000000000..64f536b80380
> > > --- /dev/null
> > > +++ b/include/asm-generic/codetag.lds.h
> > > @@ -0,0 +1,14 @@
> > > +/* SPDX-License-Identifier: GPL-2.0-only */
> > > +#ifndef __ASM_GENERIC_CODETAG_LDS_H
> > > +#define __ASM_GENERIC_CODETAG_LDS_H
> > > +
> > > +#define SECTION_WITH_BOUNDARIES(_name)       \
> > > +     . = ALIGN(8);                   \
> > > +     __start_##_name = .;            \
> > > +     KEEP(*(_name))                  \
> > > +     __stop_##_name = .;
> > > +
> > > +#define CODETAG_SECTIONS()           \
> > > +     SECTION_WITH_BOUNDARIES(alloc_tags)
> > > +
> > > +#endif /* __ASM_GENERIC_CODETAG_LDS_H */
> > > diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
> > > index 5dd3a61d673d..c9997dc50c50 100644
> > > --- a/include/asm-generic/vmlinux.lds.h
> > > +++ b/include/asm-generic/vmlinux.lds.h
> > > @@ -50,6 +50,8 @@
> > >   *               [__nosave_begin, __nosave_end] for the nosave data
> > >   */
> > >
> > > +#include <asm-generic/codetag.lds.h>
> > > +
> > >  #ifndef LOAD_OFFSET
> > >  #define LOAD_OFFSET 0
> > >  #endif
> > > @@ -366,6 +368,7 @@
> > >       . = ALIGN(8);                                                   \
> > >       BOUNDED_SECTION_BY(__dyndbg_classes, ___dyndbg_classes)         \
> > >       BOUNDED_SECTION_BY(__dyndbg, ___dyndbg)                         \
> > > +     CODETAG_SECTIONS()                                              \
> > >       LIKELY_PROFILE()                                                \
> > >       BRANCH_PROFILE()                                                \
> > >       TRACE_PRINTKS()                                                 \
> > > diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
> > > new file mode 100644
> > > index 000000000000..cf55a149fa84
> > > --- /dev/null
> > > +++ b/include/linux/alloc_tag.h
> > > @@ -0,0 +1,133 @@
> > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > +/*
> > > + * allocation tagging
> > > + */
> > > +#ifndef _LINUX_ALLOC_TAG_H
> > > +#define _LINUX_ALLOC_TAG_H
> > > +
> > > +#include <linux/bug.h>
> > > +#include <linux/codetag.h>
> > > +#include <linux/container_of.h>
> > > +#include <linux/preempt.h>
> > > +#include <asm/percpu.h>
> > > +#include <linux/cpumask.h>
> > > +#include <linux/static_key.h>
> > > +
> > > +struct alloc_tag_counters {
> > > +     u64 bytes;
> > > +     u64 calls;
> > > +};
> > > +
> > > +/*
> > > + * An instance of this structure is created in a special ELF section at every
> > > + * allocation callsite. At runtime, the special section is treated as
> > > + * an array of these. Embedded codetag utilizes codetag framework.
> > > + */
> > > +struct alloc_tag {
> > > +     struct codetag                  ct;
> > > +     struct alloc_tag_counters __percpu      *counters;
> > > +} __aligned(8);
> > > +
> > > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > > +
> > > +static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct)
> > > +{
> > > +     return container_of(ct, struct alloc_tag, ct);
> > > +}
> > > +
> > > +#ifdef ARCH_NEEDS_WEAK_PER_CPU
> > > +/*
> > > + * When percpu variables are required to be defined as weak, static percpu
> > > + * variables can't be used inside a function (see comments for DECLARE_PER_CPU_SECTION).
> > > + */
> > > +#error "Memory allocation profiling is incompatible with ARCH_NEEDS_WEAK_PER_CPU"
> >
> > Is this enforced via Kconfig as well? (Looks like only alpha and s390?)
> 
> Unfortunately ARCH_NEEDS_WEAK_PER_CPU is not a Kconfig option but
> CONFIG_DEBUG_FORCE_WEAK_PER_CPU is, so that one is handled via Kconfig
> (see "depends on !DEBUG_FORCE_WEAK_PER_CPU" in this patch). We have to
> avoid both cases because of this:
> https://elixir.bootlin.com/linux/latest/source/include/linux/percpu-defs.h#L75,
> so I'm trying to provide an informative error here.
> 
> >
> > > +#endif
> > > +
> > > +#define DEFINE_ALLOC_TAG(_alloc_tag, _old)                                   \
> > > +     static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr);      \
> > > +     static struct alloc_tag _alloc_tag __used __aligned(8)                  \
> > > +     __section("alloc_tags") = {                                             \
> > > +             .ct = CODE_TAG_INIT,                                            \
> > > +             .counters = &_alloc_tag_cntr };                                 \
> > > +     struct alloc_tag * __maybe_unused _old = alloc_tag_save(&_alloc_tag)
> > > +
> > > +DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
> > > +                     mem_alloc_profiling_key);
> > > +
> > > +static inline bool mem_alloc_profiling_enabled(void)
> > > +{
> > > +     return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
> > > +                                &mem_alloc_profiling_key);
> > > +}
> > > +
> > > +static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag)
> > > +{
> > > +     struct alloc_tag_counters v = { 0, 0 };
> > > +     struct alloc_tag_counters *counter;
> > > +     int cpu;
> > > +
> > > +     for_each_possible_cpu(cpu) {
> > > +             counter = per_cpu_ptr(tag->counters, cpu);
> > > +             v.bytes += counter->bytes;
> > > +             v.calls += counter->calls;
> > > +     }
> > > +
> > > +     return v;
> > > +}
> > > +
> > > +static inline void __alloc_tag_sub(union codetag_ref *ref, size_t bytes)
> > > +{
> > > +     struct alloc_tag *tag;
> > > +
> > > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> > > +     WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
> > > +#endif
> > > +     if (!ref || !ref->ct)
> > > +             return;
> > > +
> > > +     tag = ct_to_alloc_tag(ref->ct);
> > > +
> > > +     this_cpu_sub(tag->counters->bytes, bytes);
> > > +     this_cpu_dec(tag->counters->calls);
> > > +
> > > +     ref->ct = NULL;
> > > +}
> > > +
> > > +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
> > > +{
> > > +     __alloc_tag_sub(ref, bytes);
> > > +}
> > > +
> > > +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes)
> > > +{
> > > +     __alloc_tag_sub(ref, bytes);
> > > +}
> > > +
> > > +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes)
> > > +{
> > > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> > > +     WARN_ONCE(ref && ref->ct,
> > > +               "alloc_tag was not cleared (got tag for %s:%u)\n",\
> > > +               ref->ct->filename, ref->ct->lineno);
> > > +
> > > +     WARN_ONCE(!tag, "current->alloc_tag not set");
> > > +#endif
> > > +     if (!ref || !tag)
> > > +             return;
> > > +
> > > +     ref->ct = &tag->ct;
> > > +     this_cpu_add(tag->counters->bytes, bytes);
> > > +     this_cpu_inc(tag->counters->calls);
> > > +}
> > > +
> > > +#else
> > > +
> > > +#define DEFINE_ALLOC_TAG(_alloc_tag, _old)
> > > +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {}
> > > +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) {}
> > > +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag,
> > > +                              size_t bytes) {}
> > > +
> > > +#endif
> > > +
> > > +#endif /* _LINUX_ALLOC_TAG_H */
> > > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > > index ffe8f618ab86..da68a10517c8 100644
> > > --- a/include/linux/sched.h
> > > +++ b/include/linux/sched.h
> > > @@ -770,6 +770,10 @@ struct task_struct {
> > >       unsigned int                    flags;
> > >       unsigned int                    ptrace;
> > >
> > > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > > +     struct alloc_tag                *alloc_tag;
> > > +#endif
> >
> > Normally scheduling is very sensitive to having anything early in
> > task_struct. I would suggest moving this the CONFIG_SCHED_CORE ifdef
> > area.
> 
> Thanks for the warning! We will look into that.
> 
> >
> > > +
> > >  #ifdef CONFIG_SMP
> > >       int                             on_cpu;
> > >       struct __call_single_node       wake_entry;
> > > @@ -810,6 +814,7 @@ struct task_struct {
> > >       struct task_group               *sched_task_group;
> > >  #endif
> > >
> > > +
> > >  #ifdef CONFIG_UCLAMP_TASK
> > >       /*
> > >        * Clamp values requested for a scheduling entity.
> > > @@ -2183,4 +2188,23 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); }
> > >
> > >  extern void sched_set_stop_task(int cpu, struct task_struct *stop);
> > >
> > > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > > +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag)
> > > +{
> > > +     swap(current->alloc_tag, tag);
> > > +     return tag;
> > > +}
> > > +
> > > +static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old)
> > > +{
> > > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> > > +     WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n");
> > > +#endif
> > > +     current->alloc_tag = old;
> > > +}
> > > +#else
> > > +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { return NULL; }
> > > +#define alloc_tag_restore(_tag, _old)
> > > +#endif
> > > +
> > >  #endif
> > > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> > > index 0be2d00c3696..78d258ca508f 100644
> > > --- a/lib/Kconfig.debug
> > > +++ b/lib/Kconfig.debug
> > > @@ -972,6 +972,31 @@ config CODE_TAGGING
> > >       bool
> > >       select KALLSYMS
> > >
> > > +config MEM_ALLOC_PROFILING
> > > +     bool "Enable memory allocation profiling"
> > > +     default n
> > > +     depends on PROC_FS
> > > +     depends on !DEBUG_FORCE_WEAK_PER_CPU
> > > +     select CODE_TAGGING
> > > +     help
> > > +       Track allocation source code and record total allocation size
> > > +       initiated at that code location. The mechanism can be used to track
> > > +       memory leaks with a low performance and memory impact.
> > > +
> > > +config MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
> > > +     bool "Enable memory allocation profiling by default"
> > > +     default y
> > > +     depends on MEM_ALLOC_PROFILING
> > > +
> > > +config MEM_ALLOC_PROFILING_DEBUG
> > > +     bool "Memory allocation profiler debugging"
> > > +     default n
> > > +     depends on MEM_ALLOC_PROFILING
> > > +     select MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
> > > +     help
> > > +       Adds warnings with helpful error messages for memory allocation
> > > +       profiling.
> > > +
> > >  source "lib/Kconfig.kasan"
> > >  source "lib/Kconfig.kfence"
> > >  source "lib/Kconfig.kmsan"
> > > diff --git a/lib/Makefile b/lib/Makefile
> > > index 6b48b22fdfac..859112f09bf5 100644
> > > --- a/lib/Makefile
> > > +++ b/lib/Makefile
> > > @@ -236,6 +236,8 @@ obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \
> > >  obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
> > >
> > >  obj-$(CONFIG_CODE_TAGGING) += codetag.o
> > > +obj-$(CONFIG_MEM_ALLOC_PROFILING) += alloc_tag.o
> > > +
> > >  lib-$(CONFIG_GENERIC_BUG) += bug.o
> > >
> > >  obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o
> > > diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
> > > new file mode 100644
> > > index 000000000000..4fc031f9cefd
> > > --- /dev/null
> > > +++ b/lib/alloc_tag.c
> > > @@ -0,0 +1,158 @@
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +#include <linux/alloc_tag.h>
> > > +#include <linux/fs.h>
> > > +#include <linux/gfp.h>
> > > +#include <linux/module.h>
> > > +#include <linux/proc_fs.h>
> > > +#include <linux/seq_buf.h>
> > > +#include <linux/seq_file.h>
> > > +
> > > +static struct codetag_type *alloc_tag_cttype;
> > > +
> > > +DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
> > > +                     mem_alloc_profiling_key);
> > > +
> > > +static void *allocinfo_start(struct seq_file *m, loff_t *pos)
> > > +{
> > > +     struct codetag_iterator *iter;
> > > +     struct codetag *ct;
> > > +     loff_t node = *pos;
> > > +
> > > +     iter = kzalloc(sizeof(*iter), GFP_KERNEL);
> > > +     m->private = iter;
> > > +     if (!iter)
> > > +             return NULL;
> > > +
> > > +     codetag_lock_module_list(alloc_tag_cttype, true);
> > > +     *iter = codetag_get_ct_iter(alloc_tag_cttype);
> > > +     while ((ct = codetag_next_ct(iter)) != NULL && node)
> > > +             node--;
> > > +
> > > +     return ct ? iter : NULL;
> > > +}
> > > +
> > > +static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos)
> > > +{
> > > +     struct codetag_iterator *iter = (struct codetag_iterator *)arg;
> > > +     struct codetag *ct = codetag_next_ct(iter);
> > > +
> > > +     (*pos)++;
> > > +     if (!ct)
> > > +             return NULL;
> > > +
> > > +     return iter;
> > > +}
> > > +
> > > +static void allocinfo_stop(struct seq_file *m, void *arg)
> > > +{
> > > +     struct codetag_iterator *iter = (struct codetag_iterator *)m->private;
> > > +
> > > +     if (iter) {
> > > +             codetag_lock_module_list(alloc_tag_cttype, false);
> > > +             kfree(iter);
> > > +     }
> > > +}
> > > +
> > > +static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct)
> > > +{
> > > +     struct alloc_tag *tag = ct_to_alloc_tag(ct);
> > > +     struct alloc_tag_counters counter = alloc_tag_read(tag);
> > > +     s64 bytes = counter.bytes;
> > > +     char val[10], *p = val;
> > > +
> > > +     if (bytes < 0) {
> > > +             *p++ = '-';
> > > +             bytes = -bytes;
> > > +     }
> > > +
> > > +     string_get_size(bytes, 1,
> > > +                     STRING_SIZE_BASE2|STRING_SIZE_NOSPACE,
> > > +                     p, val + ARRAY_SIZE(val) - p);
> > > +
> > > +     seq_buf_printf(out, "%8s %8llu ", val, counter.calls);
> > > +     codetag_to_text(out, ct);
> > > +     seq_buf_putc(out, ' ');
> > > +     seq_buf_putc(out, '\n');
> > > +}
> >
> > /me does happy seq_buf dance!
> >
> > > +
> > > +static int allocinfo_show(struct seq_file *m, void *arg)
> > > +{
> > > +     struct codetag_iterator *iter = (struct codetag_iterator *)arg;
> > > +     char *bufp;
> > > +     size_t n = seq_get_buf(m, &bufp);
> > > +     struct seq_buf buf;
> > > +
> > > +     seq_buf_init(&buf, bufp, n);
> > > +     alloc_tag_to_text(&buf, iter->ct);
> > > +     seq_commit(m, seq_buf_used(&buf));
> > > +     return 0;
> > > +}
> > > +
> > > +static const struct seq_operations allocinfo_seq_op = {
> > > +     .start  = allocinfo_start,
> > > +     .next   = allocinfo_next,
> > > +     .stop   = allocinfo_stop,
> > > +     .show   = allocinfo_show,
> > > +};
> > > +
> > > +static void __init procfs_init(void)
> > > +{
> > > +     proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op);
> > > +}
> >
> > As mentioned, this really should be in /sys somewhere.
> 
> Ack.
> 
> >
> > > +
> > > +static bool alloc_tag_module_unload(struct codetag_type *cttype,
> > > +                                 struct codetag_module *cmod)
> > > +{
> > > +     struct codetag_iterator iter = codetag_get_ct_iter(cttype);
> > > +     struct alloc_tag_counters counter;
> > > +     bool module_unused = true;
> > > +     struct alloc_tag *tag;
> > > +     struct codetag *ct;
> > > +
> > > +     for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) {
> > > +             if (iter.cmod != cmod)
> > > +                     continue;
> > > +
> > > +             tag = ct_to_alloc_tag(ct);
> > > +             counter = alloc_tag_read(tag);
> > > +
> > > +             if (WARN(counter.bytes, "%s:%u module %s func:%s has %llu allocated at module unload",
> > > +                       ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes))
> > > +                     module_unused = false;
> > > +     }
> > > +
> > > +     return module_unused;
> > > +}
> > > +
> > > +static struct ctl_table memory_allocation_profiling_sysctls[] = {
> > > +     {
> > > +             .procname       = "mem_profiling",
> > > +             .data           = &mem_alloc_profiling_key,
> > > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> > > +             .mode           = 0444,
> > > +#else
> > > +             .mode           = 0644,
> > > +#endif
> > > +             .proc_handler   = proc_do_static_key,
> > > +     },
> > > +     { }
> > > +};
> > > +
> > > +static int __init alloc_tag_init(void)
> > > +{
> > > +     const struct codetag_type_desc desc = {
> > > +             .section        = "alloc_tags",
> > > +             .tag_size       = sizeof(struct alloc_tag),
> > > +             .module_unload  = alloc_tag_module_unload,
> > > +     };
> > > +
> > > +     alloc_tag_cttype = codetag_register_type(&desc);
> > > +     if (IS_ERR_OR_NULL(alloc_tag_cttype))
> > > +             return PTR_ERR(alloc_tag_cttype);
> > > +
> > > +     register_sysctl_init("vm", memory_allocation_profiling_sysctls);
> > > +     procfs_init();
> > > +
> > > +     return 0;
> > > +}
> > > +module_init(alloc_tag_init);
> > > diff --git a/scripts/module.lds.S b/scripts/module.lds.S
> > > index bf5bcf2836d8..45c67a0994f3 100644
> > > --- a/scripts/module.lds.S
> > > +++ b/scripts/module.lds.S
> > > @@ -9,6 +9,8 @@
> > >  #define DISCARD_EH_FRAME     *(.eh_frame)
> > >  #endif
> > >
> > > +#include <asm-generic/codetag.lds.h>
> > > +
> > >  SECTIONS {
> > >       /DISCARD/ : {
> > >               *(.discard)
> > > @@ -47,12 +49,17 @@ SECTIONS {
> > >       .data : {
> > >               *(.data .data.[0-9a-zA-Z_]*)
> > >               *(.data..L*)
> > > +             CODETAG_SECTIONS()
> > >       }
> > >
> > >       .rodata : {
> > >               *(.rodata .rodata.[0-9a-zA-Z_]*)
> > >               *(.rodata..L*)
> > >       }
> > > +#else
> > > +     .data : {
> > > +             CODETAG_SECTIONS()
> > > +     }
> > >  #endif
> > >  }
> >
> > Otherwise, looks good.
> 
> Thanks!
> 
> >
> > --
> > Kees Cook
>
Suren Baghdasaryan Feb. 13, 2024, 10:35 p.m. UTC | #4
On Tue, Feb 13, 2024 at 2:29 PM Darrick J. Wong <djwong@kernel.org> wrote:
>
> On Mon, Feb 12, 2024 at 05:01:19PM -0800, Suren Baghdasaryan wrote:
> > On Mon, Feb 12, 2024 at 2:40 PM Kees Cook <keescook@chromium.org> wrote:
> > >
> > > On Mon, Feb 12, 2024 at 01:38:59PM -0800, Suren Baghdasaryan wrote:
> > > > Introduce CONFIG_MEM_ALLOC_PROFILING which provides definitions to easily
> > > > instrument memory allocators. It registers an "alloc_tags" codetag type
> > > > with /proc/allocinfo interface to output allocation tag information when
> > >
> > > Please don't add anything new to the top-level /proc directory. This
> > > should likely live in /sys.
> >
> > Ack. I'll find a more appropriate place for it then.
> > It just seemed like such generic information which would belong next
> > to meminfo/zoneinfo and such...
>
> Save yourself a cycle of "rework the whole fs interface only to have
> someone else tell you no" and put it in debugfs, not sysfs.  Wrangling
> with debugfs is easier than all the macro-happy sysfs stuff; you don't
> have to integrate with the "device" model; and there is no 'one value
> per file' rule.

Thanks for the input. This file used to be in debugfs but reviewers
felt it belonged in /proc if it's to be used in production
environments. Some distros (like Android) disable debugfs in
production.

>
> --D
>
> > >
> > > > the feature is enabled.
> > > > CONFIG_MEM_ALLOC_PROFILING_DEBUG is provided for debugging the memory
> > > > allocation profiling instrumentation.
> > > > Memory allocation profiling can be enabled or disabled at runtime using
> > > > /proc/sys/vm/mem_profiling sysctl when CONFIG_MEM_ALLOC_PROFILING_DEBUG=n.
> > > > CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT enables memory allocation
> > > > profiling by default.
> > > >
> > > > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > > > Co-developed-by: Kent Overstreet <kent.overstreet@linux.dev>
> > > > Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> > > > ---
> > > >  Documentation/admin-guide/sysctl/vm.rst |  16 +++
> > > >  Documentation/filesystems/proc.rst      |  28 +++++
> > > >  include/asm-generic/codetag.lds.h       |  14 +++
> > > >  include/asm-generic/vmlinux.lds.h       |   3 +
> > > >  include/linux/alloc_tag.h               | 133 ++++++++++++++++++++
> > > >  include/linux/sched.h                   |  24 ++++
> > > >  lib/Kconfig.debug                       |  25 ++++
> > > >  lib/Makefile                            |   2 +
> > > >  lib/alloc_tag.c                         | 158 ++++++++++++++++++++++++
> > > >  scripts/module.lds.S                    |   7 ++
> > > >  10 files changed, 410 insertions(+)
> > > >  create mode 100644 include/asm-generic/codetag.lds.h
> > > >  create mode 100644 include/linux/alloc_tag.h
> > > >  create mode 100644 lib/alloc_tag.c
> > > >
> > > > diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
> > > > index c59889de122b..a214719492ea 100644
> > > > --- a/Documentation/admin-guide/sysctl/vm.rst
> > > > +++ b/Documentation/admin-guide/sysctl/vm.rst
> > > > @@ -43,6 +43,7 @@ Currently, these files are in /proc/sys/vm:
> > > >  - legacy_va_layout
> > > >  - lowmem_reserve_ratio
> > > >  - max_map_count
> > > > +- mem_profiling         (only if CONFIG_MEM_ALLOC_PROFILING=y)
> > > >  - memory_failure_early_kill
> > > >  - memory_failure_recovery
> > > >  - min_free_kbytes
> > > > @@ -425,6 +426,21 @@ e.g., up to one or two maps per allocation.
> > > >  The default value is 65530.
> > > >
> > > >
> > > > +mem_profiling
> > > > +==============
> > > > +
> > > > +Enable memory profiling (when CONFIG_MEM_ALLOC_PROFILING=y)
> > > > +
> > > > +1: Enable memory profiling.
> > > > +
> > > > +0: Disabld memory profiling.
> > > > +
> > > > +Enabling memory profiling introduces a small performance overhead for all
> > > > +memory allocations.
> > > > +
> > > > +The default value depends on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT.
> > > > +
> > > > +
> > > >  memory_failure_early_kill:
> > > >  ==========================
> > > >
> > > > diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
> > > > index 104c6d047d9b..40d6d18308e4 100644
> > > > --- a/Documentation/filesystems/proc.rst
> > > > +++ b/Documentation/filesystems/proc.rst
> > > > @@ -688,6 +688,7 @@ files are there, and which are missing.
> > > >   ============ ===============================================================
> > > >   File         Content
> > > >   ============ ===============================================================
> > > > + allocinfo    Memory allocations profiling information
> > > >   apm          Advanced power management info
> > > >   bootconfig   Kernel command line obtained from boot config,
> > > >             and, if there were kernel parameters from the
> > > > @@ -953,6 +954,33 @@ also be allocatable although a lot of filesystem metadata may have to be
> > > >  reclaimed to achieve this.
> > > >
> > > >
> > > > +allocinfo
> > > > +~~~~~~~
> > > > +
> > > > +Provides information about memory allocations at all locations in the code
> > > > +base. Each allocation in the code is identified by its source file, line
> > > > +number, module and the function calling the allocation. The number of bytes
> > > > +allocated at each location is reported.
> > > > +
> > > > +Example output.
> > > > +
> > > > +::
> > > > +
> > > > +    > cat /proc/allocinfo
> > > > +
> > > > +      153MiB     mm/slub.c:1826 module:slub func:alloc_slab_page
> > > > +     6.08MiB     mm/slab_common.c:950 module:slab_common func:_kmalloc_order
> > > > +     5.09MiB     mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts
> > > > +     4.54MiB     mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact
> > > > +     1.32MiB     include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one
> > > > +     1.16MiB     fs/xfs/xfs_log_priv.h:700 module:xfs func:xlog_kvmalloc
> > > > +     1.00MiB     mm/swap_cgroup.c:48 module:swap_cgroup func:swap_cgroup_prepare
> > > > +      734KiB     fs/xfs/kmem.c:20 module:xfs func:kmem_alloc
> > > > +      640KiB     kernel/rcu/tree.c:3184 module:tree func:fill_page_cache_func
> > > > +      640KiB     drivers/char/virtio_console.c:452 module:virtio_console func:alloc_buf
> > > > +      ...
> > > > +
> > > > +
> > > >  meminfo
> > > >  ~~~~~~~
> > > >
> > > > diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h
> > > > new file mode 100644
> > > > index 000000000000..64f536b80380
> > > > --- /dev/null
> > > > +++ b/include/asm-generic/codetag.lds.h
> > > > @@ -0,0 +1,14 @@
> > > > +/* SPDX-License-Identifier: GPL-2.0-only */
> > > > +#ifndef __ASM_GENERIC_CODETAG_LDS_H
> > > > +#define __ASM_GENERIC_CODETAG_LDS_H
> > > > +
> > > > +#define SECTION_WITH_BOUNDARIES(_name)       \
> > > > +     . = ALIGN(8);                   \
> > > > +     __start_##_name = .;            \
> > > > +     KEEP(*(_name))                  \
> > > > +     __stop_##_name = .;
> > > > +
> > > > +#define CODETAG_SECTIONS()           \
> > > > +     SECTION_WITH_BOUNDARIES(alloc_tags)
> > > > +
> > > > +#endif /* __ASM_GENERIC_CODETAG_LDS_H */
> > > > diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
> > > > index 5dd3a61d673d..c9997dc50c50 100644
> > > > --- a/include/asm-generic/vmlinux.lds.h
> > > > +++ b/include/asm-generic/vmlinux.lds.h
> > > > @@ -50,6 +50,8 @@
> > > >   *               [__nosave_begin, __nosave_end] for the nosave data
> > > >   */
> > > >
> > > > +#include <asm-generic/codetag.lds.h>
> > > > +
> > > >  #ifndef LOAD_OFFSET
> > > >  #define LOAD_OFFSET 0
> > > >  #endif
> > > > @@ -366,6 +368,7 @@
> > > >       . = ALIGN(8);                                                   \
> > > >       BOUNDED_SECTION_BY(__dyndbg_classes, ___dyndbg_classes)         \
> > > >       BOUNDED_SECTION_BY(__dyndbg, ___dyndbg)                         \
> > > > +     CODETAG_SECTIONS()                                              \
> > > >       LIKELY_PROFILE()                                                \
> > > >       BRANCH_PROFILE()                                                \
> > > >       TRACE_PRINTKS()                                                 \
> > > > diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
> > > > new file mode 100644
> > > > index 000000000000..cf55a149fa84
> > > > --- /dev/null
> > > > +++ b/include/linux/alloc_tag.h
> > > > @@ -0,0 +1,133 @@
> > > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > > +/*
> > > > + * allocation tagging
> > > > + */
> > > > +#ifndef _LINUX_ALLOC_TAG_H
> > > > +#define _LINUX_ALLOC_TAG_H
> > > > +
> > > > +#include <linux/bug.h>
> > > > +#include <linux/codetag.h>
> > > > +#include <linux/container_of.h>
> > > > +#include <linux/preempt.h>
> > > > +#include <asm/percpu.h>
> > > > +#include <linux/cpumask.h>
> > > > +#include <linux/static_key.h>
> > > > +
> > > > +struct alloc_tag_counters {
> > > > +     u64 bytes;
> > > > +     u64 calls;
> > > > +};
> > > > +
> > > > +/*
> > > > + * An instance of this structure is created in a special ELF section at every
> > > > + * allocation callsite. At runtime, the special section is treated as
> > > > + * an array of these. Embedded codetag utilizes codetag framework.
> > > > + */
> > > > +struct alloc_tag {
> > > > +     struct codetag                  ct;
> > > > +     struct alloc_tag_counters __percpu      *counters;
> > > > +} __aligned(8);
> > > > +
> > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > > > +
> > > > +static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct)
> > > > +{
> > > > +     return container_of(ct, struct alloc_tag, ct);
> > > > +}
> > > > +
> > > > +#ifdef ARCH_NEEDS_WEAK_PER_CPU
> > > > +/*
> > > > + * When percpu variables are required to be defined as weak, static percpu
> > > > + * variables can't be used inside a function (see comments for DECLARE_PER_CPU_SECTION).
> > > > + */
> > > > +#error "Memory allocation profiling is incompatible with ARCH_NEEDS_WEAK_PER_CPU"
> > >
> > > Is this enforced via Kconfig as well? (Looks like only alpha and s390?)
> >
> > Unfortunately ARCH_NEEDS_WEAK_PER_CPU is not a Kconfig option but
> > CONFIG_DEBUG_FORCE_WEAK_PER_CPU is, so that one is handled via Kconfig
> > (see "depends on !DEBUG_FORCE_WEAK_PER_CPU" in this patch). We have to
> > avoid both cases because of this:
> > https://elixir.bootlin.com/linux/latest/source/include/linux/percpu-defs.h#L75,
> > so I'm trying to provide an informative error here.
> >
> > >
> > > > +#endif
> > > > +
> > > > +#define DEFINE_ALLOC_TAG(_alloc_tag, _old)                                   \
> > > > +     static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr);      \
> > > > +     static struct alloc_tag _alloc_tag __used __aligned(8)                  \
> > > > +     __section("alloc_tags") = {                                             \
> > > > +             .ct = CODE_TAG_INIT,                                            \
> > > > +             .counters = &_alloc_tag_cntr };                                 \
> > > > +     struct alloc_tag * __maybe_unused _old = alloc_tag_save(&_alloc_tag)
> > > > +
> > > > +DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
> > > > +                     mem_alloc_profiling_key);
> > > > +
> > > > +static inline bool mem_alloc_profiling_enabled(void)
> > > > +{
> > > > +     return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
> > > > +                                &mem_alloc_profiling_key);
> > > > +}
> > > > +
> > > > +static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag)
> > > > +{
> > > > +     struct alloc_tag_counters v = { 0, 0 };
> > > > +     struct alloc_tag_counters *counter;
> > > > +     int cpu;
> > > > +
> > > > +     for_each_possible_cpu(cpu) {
> > > > +             counter = per_cpu_ptr(tag->counters, cpu);
> > > > +             v.bytes += counter->bytes;
> > > > +             v.calls += counter->calls;
> > > > +     }
> > > > +
> > > > +     return v;
> > > > +}
> > > > +
> > > > +static inline void __alloc_tag_sub(union codetag_ref *ref, size_t bytes)
> > > > +{
> > > > +     struct alloc_tag *tag;
> > > > +
> > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> > > > +     WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
> > > > +#endif
> > > > +     if (!ref || !ref->ct)
> > > > +             return;
> > > > +
> > > > +     tag = ct_to_alloc_tag(ref->ct);
> > > > +
> > > > +     this_cpu_sub(tag->counters->bytes, bytes);
> > > > +     this_cpu_dec(tag->counters->calls);
> > > > +
> > > > +     ref->ct = NULL;
> > > > +}
> > > > +
> > > > +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
> > > > +{
> > > > +     __alloc_tag_sub(ref, bytes);
> > > > +}
> > > > +
> > > > +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes)
> > > > +{
> > > > +     __alloc_tag_sub(ref, bytes);
> > > > +}
> > > > +
> > > > +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes)
> > > > +{
> > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> > > > +     WARN_ONCE(ref && ref->ct,
> > > > +               "alloc_tag was not cleared (got tag for %s:%u)\n",\
> > > > +               ref->ct->filename, ref->ct->lineno);
> > > > +
> > > > +     WARN_ONCE(!tag, "current->alloc_tag not set");
> > > > +#endif
> > > > +     if (!ref || !tag)
> > > > +             return;
> > > > +
> > > > +     ref->ct = &tag->ct;
> > > > +     this_cpu_add(tag->counters->bytes, bytes);
> > > > +     this_cpu_inc(tag->counters->calls);
> > > > +}
> > > > +
> > > > +#else
> > > > +
> > > > +#define DEFINE_ALLOC_TAG(_alloc_tag, _old)
> > > > +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {}
> > > > +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) {}
> > > > +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag,
> > > > +                              size_t bytes) {}
> > > > +
> > > > +#endif
> > > > +
> > > > +#endif /* _LINUX_ALLOC_TAG_H */
> > > > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > > > index ffe8f618ab86..da68a10517c8 100644
> > > > --- a/include/linux/sched.h
> > > > +++ b/include/linux/sched.h
> > > > @@ -770,6 +770,10 @@ struct task_struct {
> > > >       unsigned int                    flags;
> > > >       unsigned int                    ptrace;
> > > >
> > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > > > +     struct alloc_tag                *alloc_tag;
> > > > +#endif
> > >
> > > Normally scheduling is very sensitive to having anything early in
> > > task_struct. I would suggest moving this the CONFIG_SCHED_CORE ifdef
> > > area.
> >
> > Thanks for the warning! We will look into that.
> >
> > >
> > > > +
> > > >  #ifdef CONFIG_SMP
> > > >       int                             on_cpu;
> > > >       struct __call_single_node       wake_entry;
> > > > @@ -810,6 +814,7 @@ struct task_struct {
> > > >       struct task_group               *sched_task_group;
> > > >  #endif
> > > >
> > > > +
> > > >  #ifdef CONFIG_UCLAMP_TASK
> > > >       /*
> > > >        * Clamp values requested for a scheduling entity.
> > > > @@ -2183,4 +2188,23 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); }
> > > >
> > > >  extern void sched_set_stop_task(int cpu, struct task_struct *stop);
> > > >
> > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > > > +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag)
> > > > +{
> > > > +     swap(current->alloc_tag, tag);
> > > > +     return tag;
> > > > +}
> > > > +
> > > > +static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old)
> > > > +{
> > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> > > > +     WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n");
> > > > +#endif
> > > > +     current->alloc_tag = old;
> > > > +}
> > > > +#else
> > > > +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { return NULL; }
> > > > +#define alloc_tag_restore(_tag, _old)
> > > > +#endif
> > > > +
> > > >  #endif
> > > > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> > > > index 0be2d00c3696..78d258ca508f 100644
> > > > --- a/lib/Kconfig.debug
> > > > +++ b/lib/Kconfig.debug
> > > > @@ -972,6 +972,31 @@ config CODE_TAGGING
> > > >       bool
> > > >       select KALLSYMS
> > > >
> > > > +config MEM_ALLOC_PROFILING
> > > > +     bool "Enable memory allocation profiling"
> > > > +     default n
> > > > +     depends on PROC_FS
> > > > +     depends on !DEBUG_FORCE_WEAK_PER_CPU
> > > > +     select CODE_TAGGING
> > > > +     help
> > > > +       Track allocation source code and record total allocation size
> > > > +       initiated at that code location. The mechanism can be used to track
> > > > +       memory leaks with a low performance and memory impact.
> > > > +
> > > > +config MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
> > > > +     bool "Enable memory allocation profiling by default"
> > > > +     default y
> > > > +     depends on MEM_ALLOC_PROFILING
> > > > +
> > > > +config MEM_ALLOC_PROFILING_DEBUG
> > > > +     bool "Memory allocation profiler debugging"
> > > > +     default n
> > > > +     depends on MEM_ALLOC_PROFILING
> > > > +     select MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
> > > > +     help
> > > > +       Adds warnings with helpful error messages for memory allocation
> > > > +       profiling.
> > > > +
> > > >  source "lib/Kconfig.kasan"
> > > >  source "lib/Kconfig.kfence"
> > > >  source "lib/Kconfig.kmsan"
> > > > diff --git a/lib/Makefile b/lib/Makefile
> > > > index 6b48b22fdfac..859112f09bf5 100644
> > > > --- a/lib/Makefile
> > > > +++ b/lib/Makefile
> > > > @@ -236,6 +236,8 @@ obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \
> > > >  obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
> > > >
> > > >  obj-$(CONFIG_CODE_TAGGING) += codetag.o
> > > > +obj-$(CONFIG_MEM_ALLOC_PROFILING) += alloc_tag.o
> > > > +
> > > >  lib-$(CONFIG_GENERIC_BUG) += bug.o
> > > >
> > > >  obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o
> > > > diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
> > > > new file mode 100644
> > > > index 000000000000..4fc031f9cefd
> > > > --- /dev/null
> > > > +++ b/lib/alloc_tag.c
> > > > @@ -0,0 +1,158 @@
> > > > +// SPDX-License-Identifier: GPL-2.0-only
> > > > +#include <linux/alloc_tag.h>
> > > > +#include <linux/fs.h>
> > > > +#include <linux/gfp.h>
> > > > +#include <linux/module.h>
> > > > +#include <linux/proc_fs.h>
> > > > +#include <linux/seq_buf.h>
> > > > +#include <linux/seq_file.h>
> > > > +
> > > > +static struct codetag_type *alloc_tag_cttype;
> > > > +
> > > > +DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
> > > > +                     mem_alloc_profiling_key);
> > > > +
> > > > +static void *allocinfo_start(struct seq_file *m, loff_t *pos)
> > > > +{
> > > > +     struct codetag_iterator *iter;
> > > > +     struct codetag *ct;
> > > > +     loff_t node = *pos;
> > > > +
> > > > +     iter = kzalloc(sizeof(*iter), GFP_KERNEL);
> > > > +     m->private = iter;
> > > > +     if (!iter)
> > > > +             return NULL;
> > > > +
> > > > +     codetag_lock_module_list(alloc_tag_cttype, true);
> > > > +     *iter = codetag_get_ct_iter(alloc_tag_cttype);
> > > > +     while ((ct = codetag_next_ct(iter)) != NULL && node)
> > > > +             node--;
> > > > +
> > > > +     return ct ? iter : NULL;
> > > > +}
> > > > +
> > > > +static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos)
> > > > +{
> > > > +     struct codetag_iterator *iter = (struct codetag_iterator *)arg;
> > > > +     struct codetag *ct = codetag_next_ct(iter);
> > > > +
> > > > +     (*pos)++;
> > > > +     if (!ct)
> > > > +             return NULL;
> > > > +
> > > > +     return iter;
> > > > +}
> > > > +
> > > > +static void allocinfo_stop(struct seq_file *m, void *arg)
> > > > +{
> > > > +     struct codetag_iterator *iter = (struct codetag_iterator *)m->private;
> > > > +
> > > > +     if (iter) {
> > > > +             codetag_lock_module_list(alloc_tag_cttype, false);
> > > > +             kfree(iter);
> > > > +     }
> > > > +}
> > > > +
> > > > +static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct)
> > > > +{
> > > > +     struct alloc_tag *tag = ct_to_alloc_tag(ct);
> > > > +     struct alloc_tag_counters counter = alloc_tag_read(tag);
> > > > +     s64 bytes = counter.bytes;
> > > > +     char val[10], *p = val;
> > > > +
> > > > +     if (bytes < 0) {
> > > > +             *p++ = '-';
> > > > +             bytes = -bytes;
> > > > +     }
> > > > +
> > > > +     string_get_size(bytes, 1,
> > > > +                     STRING_SIZE_BASE2|STRING_SIZE_NOSPACE,
> > > > +                     p, val + ARRAY_SIZE(val) - p);
> > > > +
> > > > +     seq_buf_printf(out, "%8s %8llu ", val, counter.calls);
> > > > +     codetag_to_text(out, ct);
> > > > +     seq_buf_putc(out, ' ');
> > > > +     seq_buf_putc(out, '\n');
> > > > +}
> > >
> > > /me does happy seq_buf dance!
> > >
> > > > +
> > > > +static int allocinfo_show(struct seq_file *m, void *arg)
> > > > +{
> > > > +     struct codetag_iterator *iter = (struct codetag_iterator *)arg;
> > > > +     char *bufp;
> > > > +     size_t n = seq_get_buf(m, &bufp);
> > > > +     struct seq_buf buf;
> > > > +
> > > > +     seq_buf_init(&buf, bufp, n);
> > > > +     alloc_tag_to_text(&buf, iter->ct);
> > > > +     seq_commit(m, seq_buf_used(&buf));
> > > > +     return 0;
> > > > +}
> > > > +
> > > > +static const struct seq_operations allocinfo_seq_op = {
> > > > +     .start  = allocinfo_start,
> > > > +     .next   = allocinfo_next,
> > > > +     .stop   = allocinfo_stop,
> > > > +     .show   = allocinfo_show,
> > > > +};
> > > > +
> > > > +static void __init procfs_init(void)
> > > > +{
> > > > +     proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op);
> > > > +}
> > >
> > > As mentioned, this really should be in /sys somewhere.
> >
> > Ack.
> >
> > >
> > > > +
> > > > +static bool alloc_tag_module_unload(struct codetag_type *cttype,
> > > > +                                 struct codetag_module *cmod)
> > > > +{
> > > > +     struct codetag_iterator iter = codetag_get_ct_iter(cttype);
> > > > +     struct alloc_tag_counters counter;
> > > > +     bool module_unused = true;
> > > > +     struct alloc_tag *tag;
> > > > +     struct codetag *ct;
> > > > +
> > > > +     for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) {
> > > > +             if (iter.cmod != cmod)
> > > > +                     continue;
> > > > +
> > > > +             tag = ct_to_alloc_tag(ct);
> > > > +             counter = alloc_tag_read(tag);
> > > > +
> > > > +             if (WARN(counter.bytes, "%s:%u module %s func:%s has %llu allocated at module unload",
> > > > +                       ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes))
> > > > +                     module_unused = false;
> > > > +     }
> > > > +
> > > > +     return module_unused;
> > > > +}
> > > > +
> > > > +static struct ctl_table memory_allocation_profiling_sysctls[] = {
> > > > +     {
> > > > +             .procname       = "mem_profiling",
> > > > +             .data           = &mem_alloc_profiling_key,
> > > > +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
> > > > +             .mode           = 0444,
> > > > +#else
> > > > +             .mode           = 0644,
> > > > +#endif
> > > > +             .proc_handler   = proc_do_static_key,
> > > > +     },
> > > > +     { }
> > > > +};
> > > > +
> > > > +static int __init alloc_tag_init(void)
> > > > +{
> > > > +     const struct codetag_type_desc desc = {
> > > > +             .section        = "alloc_tags",
> > > > +             .tag_size       = sizeof(struct alloc_tag),
> > > > +             .module_unload  = alloc_tag_module_unload,
> > > > +     };
> > > > +
> > > > +     alloc_tag_cttype = codetag_register_type(&desc);
> > > > +     if (IS_ERR_OR_NULL(alloc_tag_cttype))
> > > > +             return PTR_ERR(alloc_tag_cttype);
> > > > +
> > > > +     register_sysctl_init("vm", memory_allocation_profiling_sysctls);
> > > > +     procfs_init();
> > > > +
> > > > +     return 0;
> > > > +}
> > > > +module_init(alloc_tag_init);
> > > > diff --git a/scripts/module.lds.S b/scripts/module.lds.S
> > > > index bf5bcf2836d8..45c67a0994f3 100644
> > > > --- a/scripts/module.lds.S
> > > > +++ b/scripts/module.lds.S
> > > > @@ -9,6 +9,8 @@
> > > >  #define DISCARD_EH_FRAME     *(.eh_frame)
> > > >  #endif
> > > >
> > > > +#include <asm-generic/codetag.lds.h>
> > > > +
> > > >  SECTIONS {
> > > >       /DISCARD/ : {
> > > >               *(.discard)
> > > > @@ -47,12 +49,17 @@ SECTIONS {
> > > >       .data : {
> > > >               *(.data .data.[0-9a-zA-Z_]*)
> > > >               *(.data..L*)
> > > > +             CODETAG_SECTIONS()
> > > >       }
> > > >
> > > >       .rodata : {
> > > >               *(.rodata .rodata.[0-9a-zA-Z_]*)
> > > >               *(.rodata..L*)
> > > >       }
> > > > +#else
> > > > +     .data : {
> > > > +             CODETAG_SECTIONS()
> > > > +     }
> > > >  #endif
> > > >  }
> > >
> > > Otherwise, looks good.
> >
> > Thanks!
> >
> > >
> > > --
> > > Kees Cook
> >
Kees Cook Feb. 13, 2024, 10:38 p.m. UTC | #5
On Tue, Feb 13, 2024 at 02:35:29PM -0800, Suren Baghdasaryan wrote:
> On Tue, Feb 13, 2024 at 2:29 PM Darrick J. Wong <djwong@kernel.org> wrote:
> >
> > On Mon, Feb 12, 2024 at 05:01:19PM -0800, Suren Baghdasaryan wrote:
> > > On Mon, Feb 12, 2024 at 2:40 PM Kees Cook <keescook@chromium.org> wrote:
> > > >
> > > > On Mon, Feb 12, 2024 at 01:38:59PM -0800, Suren Baghdasaryan wrote:
> > > > > Introduce CONFIG_MEM_ALLOC_PROFILING which provides definitions to easily
> > > > > instrument memory allocators. It registers an "alloc_tags" codetag type
> > > > > with /proc/allocinfo interface to output allocation tag information when
> > > >
> > > > Please don't add anything new to the top-level /proc directory. This
> > > > should likely live in /sys.
> > >
> > > Ack. I'll find a more appropriate place for it then.
> > > It just seemed like such generic information which would belong next
> > > to meminfo/zoneinfo and such...
> >
> > Save yourself a cycle of "rework the whole fs interface only to have
> > someone else tell you no" and put it in debugfs, not sysfs.  Wrangling
> > with debugfs is easier than all the macro-happy sysfs stuff; you don't
> > have to integrate with the "device" model; and there is no 'one value
> > per file' rule.
> 
> Thanks for the input. This file used to be in debugfs but reviewers
> felt it belonged in /proc if it's to be used in production
> environments. Some distros (like Android) disable debugfs in
> production.

FWIW, I agree debugfs is not right. If others feel it's right in /proc,
I certainly won't NAK -- it's just been that we've traditionally been
trying to avoid continuing to pollute the top-level /proc and instead
associate new things with something in /sys.
Steven Rostedt Feb. 13, 2024, 10:47 p.m. UTC | #6
On Tue, 13 Feb 2024 14:38:16 -0800
Kees Cook <keescook@chromium.org> wrote:

> > > Save yourself a cycle of "rework the whole fs interface only to have
> > > someone else tell you no" and put it in debugfs, not sysfs.  Wrangling
> > > with debugfs is easier than all the macro-happy sysfs stuff; you don't
> > > have to integrate with the "device" model; and there is no 'one value
> > > per file' rule.  
> > 
> > Thanks for the input. This file used to be in debugfs but reviewers
> > felt it belonged in /proc if it's to be used in production
> > environments. Some distros (like Android) disable debugfs in
> > production.  
> 
> FWIW, I agree debugfs is not right. If others feel it's right in /proc,
> I certainly won't NAK -- it's just been that we've traditionally been
> trying to avoid continuing to pollute the top-level /proc and instead
> associate new things with something in /sys.

You can create your own file system, but I would suggest using kernfs for it ;-)

If you look in /sys/kernel/ you'll see a bunch of kernel file systems already there:

 ~# mount |grep kernel
 securityfs on /sys/kernel/security type securityfs (rw,nosuid,nodev,noexec,relatime)
 debugfs on /sys/kernel/debug type debugfs (rw,nosuid,nodev,noexec,relatime)
 tracefs on /sys/kernel/tracing type tracefs (rw,nosuid,nodev,noexec,relatime)
 configfs on /sys/kernel/config type configfs (rw,nosuid,nodev,noexec,relatime)

-- Steve
Andrew Morton Feb. 16, 2024, 12:54 a.m. UTC | #7
On Mon, 12 Feb 2024 13:38:59 -0800 Suren Baghdasaryan <surenb@google.com> wrote:

> +Example output.
> +
> +::
> +
> +    > cat /proc/allocinfo
> +
> +      153MiB     mm/slub.c:1826 module:slub func:alloc_slab_page
> +     6.08MiB     mm/slab_common.c:950 module:slab_common func:_kmalloc_order
> +     5.09MiB     mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts
> +     4.54MiB     mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact
> +     1.32MiB     include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one

I don't really like the fancy MiB stuff.  Wouldn't it be better to just
present the amount of memory in plain old bytes, so people can use sort
-n on it?  And it's easier to tell big-from-small at a glance because
big has more digits.

Also, the first thing any sort of downstream processing of this data is
going to have to do is to convert the fancified output back into
plain-old-bytes.  So why not just emit plain-old-bytes?

If someone wants the fancy output (and nobody does) then that can be
done in userspace.
Kent Overstreet Feb. 16, 2024, 1 a.m. UTC | #8
On Thu, Feb 15, 2024 at 04:54:38PM -0800, Andrew Morton wrote:
> On Mon, 12 Feb 2024 13:38:59 -0800 Suren Baghdasaryan <surenb@google.com> wrote:
> 
> > +Example output.
> > +
> > +::
> > +
> > +    > cat /proc/allocinfo
> > +
> > +      153MiB     mm/slub.c:1826 module:slub func:alloc_slab_page
> > +     6.08MiB     mm/slab_common.c:950 module:slab_common func:_kmalloc_order
> > +     5.09MiB     mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts
> > +     4.54MiB     mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact
> > +     1.32MiB     include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one
> 
> I don't really like the fancy MiB stuff.  Wouldn't it be better to just
> present the amount of memory in plain old bytes, so people can use sort
> -n on it?

They can use sort -h on it; the string_get_size() patch was specifically
so that we could make the output compatible with sort -h

> And it's easier to tell big-from-small at a glance because
> big has more digits.
> 
> Also, the first thing any sort of downstream processing of this data is
> going to have to do is to convert the fancified output back into
> plain-old-bytes.  So why not just emit plain-old-bytes?
> 
> If someone wants the fancy output (and nobody does) then that can be
> done in userspace.

I like simpler, more discoverable tools; e.g. we've got a bunch of
interesting stuff in scripts/ but it doesn't get used nearly as much -
not as accessible as cat'ing a file, definitely not going to be
installed by default.

I'm just optimizing for the most common use case. I doubt there's going
to be nearly as much consumption by tools, and I'm ok with making them
do the conversion back to bytes if they really need it.
Pasha Tatashin Feb. 16, 2024, 1:22 a.m. UTC | #9
On Thu, Feb 15, 2024 at 8:00 PM Kent Overstreet
<kent.overstreet@linux.dev> wrote:
>
> On Thu, Feb 15, 2024 at 04:54:38PM -0800, Andrew Morton wrote:
> > On Mon, 12 Feb 2024 13:38:59 -0800 Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > > +Example output.
> > > +
> > > +::
> > > +
> > > +    > cat /proc/allocinfo
> > > +
> > > +      153MiB     mm/slub.c:1826 module:slub func:alloc_slab_page
> > > +     6.08MiB     mm/slab_common.c:950 module:slab_common func:_kmalloc_order
> > > +     5.09MiB     mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts
> > > +     4.54MiB     mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact
> > > +     1.32MiB     include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one
> >
> > I don't really like the fancy MiB stuff.  Wouldn't it be better to just
> > present the amount of memory in plain old bytes, so people can use sort
> > -n on it?
>
> They can use sort -h on it; the string_get_size() patch was specifically
> so that we could make the output compatible with sort -h
>
> > And it's easier to tell big-from-small at a glance because
> > big has more digits.
> >
> > Also, the first thing any sort of downstream processing of this data is
> > going to have to do is to convert the fancified output back into
> > plain-old-bytes.  So why not just emit plain-old-bytes?
> >
> > If someone wants the fancy output (and nobody does) then that can be
> > done in userspace.
>
> I like simpler, more discoverable tools; e.g. we've got a bunch of
> interesting stuff in scripts/ but it doesn't get used nearly as much -
> not as accessible as cat'ing a file, definitely not going to be
> installed by default.

I also prefer plain bytes instead of MiB. A driver developer that
wants to verify up-to the byte allocations for a new data structure
that they added is going to be disappointed by the rounded MiB
numbers.

The data contained in this file is not consumable without at least
"sort -h -r", so why not just output bytes instead?

There is /proc/slabinfo  and there is a slabtop tool.
For raw /proc/allocinfo we can create an alloctop tool that would
parse, sort and show data in human readable format based on various
criteria.

We should also add at the top of this file "allocinfo - version: 1.0",
to allow future extensions (i.e. column for proc name).

Pasha
Kent Overstreet Feb. 16, 2024, 1:27 a.m. UTC | #10
On Thu, Feb 15, 2024 at 08:22:44PM -0500, Pasha Tatashin wrote:
> On Thu, Feb 15, 2024 at 8:00 PM Kent Overstreet
> <kent.overstreet@linux.dev> wrote:
> >
> > On Thu, Feb 15, 2024 at 04:54:38PM -0800, Andrew Morton wrote:
> > > On Mon, 12 Feb 2024 13:38:59 -0800 Suren Baghdasaryan <surenb@google.com> wrote:
> > >
> > > > +Example output.
> > > > +
> > > > +::
> > > > +
> > > > +    > cat /proc/allocinfo
> > > > +
> > > > +      153MiB     mm/slub.c:1826 module:slub func:alloc_slab_page
> > > > +     6.08MiB     mm/slab_common.c:950 module:slab_common func:_kmalloc_order
> > > > +     5.09MiB     mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts
> > > > +     4.54MiB     mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact
> > > > +     1.32MiB     include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one
> > >
> > > I don't really like the fancy MiB stuff.  Wouldn't it be better to just
> > > present the amount of memory in plain old bytes, so people can use sort
> > > -n on it?
> >
> > They can use sort -h on it; the string_get_size() patch was specifically
> > so that we could make the output compatible with sort -h
> >
> > > And it's easier to tell big-from-small at a glance because
> > > big has more digits.
> > >
> > > Also, the first thing any sort of downstream processing of this data is
> > > going to have to do is to convert the fancified output back into
> > > plain-old-bytes.  So why not just emit plain-old-bytes?
> > >
> > > If someone wants the fancy output (and nobody does) then that can be
> > > done in userspace.
> >
> > I like simpler, more discoverable tools; e.g. we've got a bunch of
> > interesting stuff in scripts/ but it doesn't get used nearly as much -
> > not as accessible as cat'ing a file, definitely not going to be
> > installed by default.
> 
> I also prefer plain bytes instead of MiB. A driver developer that
> wants to verify up-to the byte allocations for a new data structure
> that they added is going to be disappointed by the rounded MiB
> numbers.

That's a fair point.

> The data contained in this file is not consumable without at least
> "sort -h -r", so why not just output bytes instead?
> 
> There is /proc/slabinfo  and there is a slabtop tool.
> For raw /proc/allocinfo we can create an alloctop tool that would
> parse, sort and show data in human readable format based on various
> criteria.
> 
> We should also add at the top of this file "allocinfo - version: 1.0",
> to allow future extensions (i.e. column for proc name).

How would we feel about exposing two different versions in /proc? It
should be a pretty minimal addition to .text.

Personally, I hate trying to count long strings digits by eyeball...
Vlastimil Babka Feb. 16, 2024, 8:50 a.m. UTC | #11
On 2/13/24 23:38, Kees Cook wrote:
> On Tue, Feb 13, 2024 at 02:35:29PM -0800, Suren Baghdasaryan wrote:
>> On Tue, Feb 13, 2024 at 2:29 PM Darrick J. Wong <djwong@kernel.org> wrote:
>> >
>> > On Mon, Feb 12, 2024 at 05:01:19PM -0800, Suren Baghdasaryan wrote:
>> > > On Mon, Feb 12, 2024 at 2:40 PM Kees Cook <keescook@chromium.org> wrote:
>> > > >
>> > > > On Mon, Feb 12, 2024 at 01:38:59PM -0800, Suren Baghdasaryan wrote:
>> > > > > Introduce CONFIG_MEM_ALLOC_PROFILING which provides definitions to easily
>> > > > > instrument memory allocators. It registers an "alloc_tags" codetag type
>> > > > > with /proc/allocinfo interface to output allocation tag information when
>> > > >
>> > > > Please don't add anything new to the top-level /proc directory. This
>> > > > should likely live in /sys.
>> > >
>> > > Ack. I'll find a more appropriate place for it then.
>> > > It just seemed like such generic information which would belong next
>> > > to meminfo/zoneinfo and such...
>> >
>> > Save yourself a cycle of "rework the whole fs interface only to have
>> > someone else tell you no" and put it in debugfs, not sysfs.  Wrangling
>> > with debugfs is easier than all the macro-happy sysfs stuff; you don't
>> > have to integrate with the "device" model; and there is no 'one value
>> > per file' rule.
>> 
>> Thanks for the input. This file used to be in debugfs but reviewers
>> felt it belonged in /proc if it's to be used in production
>> environments. Some distros (like Android) disable debugfs in
>> production.
> 
> FWIW, I agree debugfs is not right. If others feel it's right in /proc,
> I certainly won't NAK -- it's just been that we've traditionally been
> trying to avoid continuing to pollute the top-level /proc and instead
> associate new things with something in /sys.

Sysfs is really a "one value per file" thing though. /proc might be ok for a
single overview file.
Suren Baghdasaryan Feb. 16, 2024, 8:55 a.m. UTC | #12
On Fri, Feb 16, 2024 at 12:50 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 2/13/24 23:38, Kees Cook wrote:
> > On Tue, Feb 13, 2024 at 02:35:29PM -0800, Suren Baghdasaryan wrote:
> >> On Tue, Feb 13, 2024 at 2:29 PM Darrick J. Wong <djwong@kernel.org> wrote:
> >> >
> >> > On Mon, Feb 12, 2024 at 05:01:19PM -0800, Suren Baghdasaryan wrote:
> >> > > On Mon, Feb 12, 2024 at 2:40 PM Kees Cook <keescook@chromium.org> wrote:
> >> > > >
> >> > > > On Mon, Feb 12, 2024 at 01:38:59PM -0800, Suren Baghdasaryan wrote:
> >> > > > > Introduce CONFIG_MEM_ALLOC_PROFILING which provides definitions to easily
> >> > > > > instrument memory allocators. It registers an "alloc_tags" codetag type
> >> > > > > with /proc/allocinfo interface to output allocation tag information when
> >> > > >
> >> > > > Please don't add anything new to the top-level /proc directory. This
> >> > > > should likely live in /sys.
> >> > >
> >> > > Ack. I'll find a more appropriate place for it then.
> >> > > It just seemed like such generic information which would belong next
> >> > > to meminfo/zoneinfo and such...
> >> >
> >> > Save yourself a cycle of "rework the whole fs interface only to have
> >> > someone else tell you no" and put it in debugfs, not sysfs.  Wrangling
> >> > with debugfs is easier than all the macro-happy sysfs stuff; you don't
> >> > have to integrate with the "device" model; and there is no 'one value
> >> > per file' rule.
> >>
> >> Thanks for the input. This file used to be in debugfs but reviewers
> >> felt it belonged in /proc if it's to be used in production
> >> environments. Some distros (like Android) disable debugfs in
> >> production.
> >
> > FWIW, I agree debugfs is not right. If others feel it's right in /proc,
> > I certainly won't NAK -- it's just been that we've traditionally been
> > trying to avoid continuing to pollute the top-level /proc and instead
> > associate new things with something in /sys.
>
> Sysfs is really a "one value per file" thing though. /proc might be ok for a
> single overview file.

I'm preparing v4 and will keep the file it under /proc for now unless
there are strong objections.
Vlastimil Babka Feb. 16, 2024, 8:57 a.m. UTC | #13
On 2/12/24 22:38, Suren Baghdasaryan wrote:
> Introduce CONFIG_MEM_ALLOC_PROFILING which provides definitions to easily
> instrument memory allocators. It registers an "alloc_tags" codetag type
> with /proc/allocinfo interface to output allocation tag information when
> the feature is enabled.
> CONFIG_MEM_ALLOC_PROFILING_DEBUG is provided for debugging the memory
> allocation profiling instrumentation.
> Memory allocation profiling can be enabled or disabled at runtime using
> /proc/sys/vm/mem_profiling sysctl when CONFIG_MEM_ALLOC_PROFILING_DEBUG=n.
> CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT enables memory allocation
> profiling by default.
> 
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> Co-developed-by: Kent Overstreet <kent.overstreet@linux.dev>
> Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> ---
>  Documentation/admin-guide/sysctl/vm.rst |  16 +++
>  Documentation/filesystems/proc.rst      |  28 +++++
>  include/asm-generic/codetag.lds.h       |  14 +++
>  include/asm-generic/vmlinux.lds.h       |   3 +
>  include/linux/alloc_tag.h               | 133 ++++++++++++++++++++
>  include/linux/sched.h                   |  24 ++++
>  lib/Kconfig.debug                       |  25 ++++
>  lib/Makefile                            |   2 +
>  lib/alloc_tag.c                         | 158 ++++++++++++++++++++++++
>  scripts/module.lds.S                    |   7 ++
>  10 files changed, 410 insertions(+)
>  create mode 100644 include/asm-generic/codetag.lds.h
>  create mode 100644 include/linux/alloc_tag.h
>  create mode 100644 lib/alloc_tag.c
> 
> diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
> index c59889de122b..a214719492ea 100644
> --- a/Documentation/admin-guide/sysctl/vm.rst
> +++ b/Documentation/admin-guide/sysctl/vm.rst
> @@ -43,6 +43,7 @@ Currently, these files are in /proc/sys/vm:
>  - legacy_va_layout
>  - lowmem_reserve_ratio
>  - max_map_count
> +- mem_profiling         (only if CONFIG_MEM_ALLOC_PROFILING=y)
>  - memory_failure_early_kill
>  - memory_failure_recovery
>  - min_free_kbytes
> @@ -425,6 +426,21 @@ e.g., up to one or two maps per allocation.
>  The default value is 65530.
>  
>  
> +mem_profiling
> +==============
> +
> +Enable memory profiling (when CONFIG_MEM_ALLOC_PROFILING=y)
> +
> +1: Enable memory profiling.
> +
> +0: Disabld memory profiling.

      Disable

...

> +allocinfo
> +~~~~~~~
> +
> +Provides information about memory allocations at all locations in the code
> +base. Each allocation in the code is identified by its source file, line
> +number, module and the function calling the allocation. The number of bytes
> +allocated at each location is reported.

See, it even says "number of bytes" :)

> +
> +Example output.
> +
> +::
> +
> +    > cat /proc/allocinfo
> +
> +      153MiB     mm/slub.c:1826 module:slub func:alloc_slab_page

Is "module" meant in the usual kernel module sense? In that case IIRC is
more common to annotate things e.g. [xfs] in case it's really a module, and
nothing if it's built it, such as slub. Is that "slub" simply derived from
"mm/slub.c"? Then it's just redundant?

> +     6.08MiB     mm/slab_common.c:950 module:slab_common func:_kmalloc_order
> +     5.09MiB     mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts
> +     4.54MiB     mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact
> +     1.32MiB     include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one
> +     1.16MiB     fs/xfs/xfs_log_priv.h:700 module:xfs func:xlog_kvmalloc
> +     1.00MiB     mm/swap_cgroup.c:48 module:swap_cgroup func:swap_cgroup_prepare
> +      734KiB     fs/xfs/kmem.c:20 module:xfs func:kmem_alloc
> +      640KiB     kernel/rcu/tree.c:3184 module:tree func:fill_page_cache_func
> +      640KiB     drivers/char/virtio_console.c:452 module:virtio_console func:alloc_buf
> +      ...
> +
> +
>  meminfo

...

> diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> index 0be2d00c3696..78d258ca508f 100644
> --- a/lib/Kconfig.debug
> +++ b/lib/Kconfig.debug
> @@ -972,6 +972,31 @@ config CODE_TAGGING
>  	bool
>  	select KALLSYMS
>  
> +config MEM_ALLOC_PROFILING
> +	bool "Enable memory allocation profiling"
> +	default n
> +	depends on PROC_FS
> +	depends on !DEBUG_FORCE_WEAK_PER_CPU
> +	select CODE_TAGGING
> +	help
> +	  Track allocation source code and record total allocation size
> +	  initiated at that code location. The mechanism can be used to track
> +	  memory leaks with a low performance and memory impact.
> +
> +config MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
> +	bool "Enable memory allocation profiling by default"
> +	default y

I'd go with default n as that I'd select for a general distro.

> +	depends on MEM_ALLOC_PROFILING
> +
> +config MEM_ALLOC_PROFILING_DEBUG
> +	bool "Memory allocation profiler debugging"
> +	default n
> +	depends on MEM_ALLOC_PROFILING
> +	select MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
> +	help
> +	  Adds warnings with helpful error messages for memory allocation
> +	  profiling.
> +
Suren Baghdasaryan Feb. 16, 2024, 9:02 a.m. UTC | #14
On Thu, Feb 15, 2024 at 5:27 PM Kent Overstreet
<kent.overstreet@linux.dev> wrote:
>
> On Thu, Feb 15, 2024 at 08:22:44PM -0500, Pasha Tatashin wrote:
> > On Thu, Feb 15, 2024 at 8:00 PM Kent Overstreet
> > <kent.overstreet@linux.dev> wrote:
> > >
> > > On Thu, Feb 15, 2024 at 04:54:38PM -0800, Andrew Morton wrote:
> > > > On Mon, 12 Feb 2024 13:38:59 -0800 Suren Baghdasaryan <surenb@google.com> wrote:
> > > >
> > > > > +Example output.
> > > > > +
> > > > > +::
> > > > > +
> > > > > +    > cat /proc/allocinfo
> > > > > +
> > > > > +      153MiB     mm/slub.c:1826 module:slub func:alloc_slab_page
> > > > > +     6.08MiB     mm/slab_common.c:950 module:slab_common func:_kmalloc_order
> > > > > +     5.09MiB     mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts
> > > > > +     4.54MiB     mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact
> > > > > +     1.32MiB     include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one
> > > >
> > > > I don't really like the fancy MiB stuff.  Wouldn't it be better to just
> > > > present the amount of memory in plain old bytes, so people can use sort
> > > > -n on it?
> > >
> > > They can use sort -h on it; the string_get_size() patch was specifically
> > > so that we could make the output compatible with sort -h
> > >
> > > > And it's easier to tell big-from-small at a glance because
> > > > big has more digits.
> > > >
> > > > Also, the first thing any sort of downstream processing of this data is
> > > > going to have to do is to convert the fancified output back into
> > > > plain-old-bytes.  So why not just emit plain-old-bytes?
> > > >
> > > > If someone wants the fancy output (and nobody does) then that can be
> > > > done in userspace.
> > >
> > > I like simpler, more discoverable tools; e.g. we've got a bunch of
> > > interesting stuff in scripts/ but it doesn't get used nearly as much -
> > > not as accessible as cat'ing a file, definitely not going to be
> > > installed by default.
> >
> > I also prefer plain bytes instead of MiB. A driver developer that
> > wants to verify up-to the byte allocations for a new data structure
> > that they added is going to be disappointed by the rounded MiB
> > numbers.
>
> That's a fair point.
>
> > The data contained in this file is not consumable without at least
> > "sort -h -r", so why not just output bytes instead?
> >
> > There is /proc/slabinfo  and there is a slabtop tool.
> > For raw /proc/allocinfo we can create an alloctop tool that would
> > parse, sort and show data in human readable format based on various
> > criteria.
> >
> > We should also add at the top of this file "allocinfo - version: 1.0",
> > to allow future extensions (i.e. column for proc name).
>
> How would we feel about exposing two different versions in /proc? It
> should be a pretty minimal addition to .text.
>
> Personally, I hate trying to count long strings digits by eyeball...

Maybe something like this work for everyone then?:

160432128 (153MiB)     mm/slub.c:1826 module:slub func:alloc_slab_page
Suren Baghdasaryan Feb. 16, 2024, 9:03 a.m. UTC | #15
On Fri, Feb 16, 2024 at 1:02 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Thu, Feb 15, 2024 at 5:27 PM Kent Overstreet
> <kent.overstreet@linux.dev> wrote:
> >
> > On Thu, Feb 15, 2024 at 08:22:44PM -0500, Pasha Tatashin wrote:
> > > On Thu, Feb 15, 2024 at 8:00 PM Kent Overstreet
> > > <kent.overstreet@linux.dev> wrote:
> > > >
> > > > On Thu, Feb 15, 2024 at 04:54:38PM -0800, Andrew Morton wrote:
> > > > > On Mon, 12 Feb 2024 13:38:59 -0800 Suren Baghdasaryan <surenb@google.com> wrote:
> > > > >
> > > > > > +Example output.
> > > > > > +
> > > > > > +::
> > > > > > +
> > > > > > +    > cat /proc/allocinfo
> > > > > > +
> > > > > > +      153MiB     mm/slub.c:1826 module:slub func:alloc_slab_page
> > > > > > +     6.08MiB     mm/slab_common.c:950 module:slab_common func:_kmalloc_order
> > > > > > +     5.09MiB     mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts
> > > > > > +     4.54MiB     mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact
> > > > > > +     1.32MiB     include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one
> > > > >
> > > > > I don't really like the fancy MiB stuff.  Wouldn't it be better to just
> > > > > present the amount of memory in plain old bytes, so people can use sort
> > > > > -n on it?
> > > >
> > > > They can use sort -h on it; the string_get_size() patch was specifically
> > > > so that we could make the output compatible with sort -h
> > > >
> > > > > And it's easier to tell big-from-small at a glance because
> > > > > big has more digits.
> > > > >
> > > > > Also, the first thing any sort of downstream processing of this data is
> > > > > going to have to do is to convert the fancified output back into
> > > > > plain-old-bytes.  So why not just emit plain-old-bytes?
> > > > >
> > > > > If someone wants the fancy output (and nobody does) then that can be
> > > > > done in userspace.
> > > >
> > > > I like simpler, more discoverable tools; e.g. we've got a bunch of
> > > > interesting stuff in scripts/ but it doesn't get used nearly as much -
> > > > not as accessible as cat'ing a file, definitely not going to be
> > > > installed by default.
> > >
> > > I also prefer plain bytes instead of MiB. A driver developer that
> > > wants to verify up-to the byte allocations for a new data structure
> > > that they added is going to be disappointed by the rounded MiB
> > > numbers.
> >
> > That's a fair point.
> >
> > > The data contained in this file is not consumable without at least
> > > "sort -h -r", so why not just output bytes instead?
> > >
> > > There is /proc/slabinfo  and there is a slabtop tool.
> > > For raw /proc/allocinfo we can create an alloctop tool that would
> > > parse, sort and show data in human readable format based on various
> > > criteria.
> > >
> > > We should also add at the top of this file "allocinfo - version: 1.0",
> > > to allow future extensions (i.e. column for proc name).
> >
> > How would we feel about exposing two different versions in /proc? It
> > should be a pretty minimal addition to .text.
> >
> > Personally, I hate trying to count long strings digits by eyeball...
>
> Maybe something like this work for everyone then?:

s/work/would work

making too many mistakes. time for bed...

>
> 160432128 (153MiB)     mm/slub.c:1826 module:slub func:alloc_slab_page
Pasha Tatashin Feb. 16, 2024, 5:18 p.m. UTC | #16
> > Personally, I hate trying to count long strings digits by eyeball...
>
> Maybe something like this work for everyone then?:
>
> 160432128 (153MiB)     mm/slub.c:1826 module:slub func:alloc_slab_page

That would be even harder to parse.

This one liner should converts bytes to human readable size:
sort -rn /proc/allocinfo | numfmt --to=iec

Also, a "alloctop" script that would auto-update the current top
allocators would be useful to put in tools/mm/

Pasha
Kent Overstreet Feb. 16, 2024, 11:26 p.m. UTC | #17
On Mon, Feb 12, 2024 at 02:40:12PM -0800, Kees Cook wrote:
> On Mon, Feb 12, 2024 at 01:38:59PM -0800, Suren Baghdasaryan wrote:
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index ffe8f618ab86..da68a10517c8 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -770,6 +770,10 @@ struct task_struct {
> >  	unsigned int			flags;
> >  	unsigned int			ptrace;
> >  
> > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > +	struct alloc_tag		*alloc_tag;
> > +#endif
> 
> Normally scheduling is very sensitive to having anything early in
> task_struct. I would suggest moving this the CONFIG_SCHED_CORE ifdef
> area.

This is even hotter than the scheduler members; we actually do want it
up front.
Kees Cook Feb. 17, 2024, 12:08 a.m. UTC | #18
On Fri, Feb 16, 2024 at 06:26:06PM -0500, Kent Overstreet wrote:
> On Mon, Feb 12, 2024 at 02:40:12PM -0800, Kees Cook wrote:
> > On Mon, Feb 12, 2024 at 01:38:59PM -0800, Suren Baghdasaryan wrote:
> > > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > > index ffe8f618ab86..da68a10517c8 100644
> > > --- a/include/linux/sched.h
> > > +++ b/include/linux/sched.h
> > > @@ -770,6 +770,10 @@ struct task_struct {
> > >  	unsigned int			flags;
> > >  	unsigned int			ptrace;
> > >  
> > > +#ifdef CONFIG_MEM_ALLOC_PROFILING
> > > +	struct alloc_tag		*alloc_tag;
> > > +#endif
> > 
> > Normally scheduling is very sensitive to having anything early in
> > task_struct. I would suggest moving this the CONFIG_SCHED_CORE ifdef
> > area.
> 
> This is even hotter than the scheduler members; we actually do want it
> up front.

It is? I would imagine the scheduler would touch stuff more than the
allocator, but whatever works. :)
Kent Overstreet Feb. 17, 2024, 8:10 p.m. UTC | #19
On Fri, Feb 16, 2024 at 12:18:09PM -0500, Pasha Tatashin wrote:
> > > Personally, I hate trying to count long strings digits by eyeball...
> >
> > Maybe something like this work for everyone then?:
> >
> > 160432128 (153MiB)     mm/slub.c:1826 module:slub func:alloc_slab_page
> 
> That would be even harder to parse.
> 
> This one liner should converts bytes to human readable size:
> sort -rn /proc/allocinfo | numfmt --to=iec

I like this, it doesn't print out that godawful kibibytes crap
Suren Baghdasaryan Feb. 18, 2024, 2:21 a.m. UTC | #20
On Fri, Feb 16, 2024 at 8:57 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 2/12/24 22:38, Suren Baghdasaryan wrote:
> > Introduce CONFIG_MEM_ALLOC_PROFILING which provides definitions to easily
> > instrument memory allocators. It registers an "alloc_tags" codetag type
> > with /proc/allocinfo interface to output allocation tag information when
> > the feature is enabled.
> > CONFIG_MEM_ALLOC_PROFILING_DEBUG is provided for debugging the memory
> > allocation profiling instrumentation.
> > Memory allocation profiling can be enabled or disabled at runtime using
> > /proc/sys/vm/mem_profiling sysctl when CONFIG_MEM_ALLOC_PROFILING_DEBUG=n.
> > CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT enables memory allocation
> > profiling by default.
> >
> > Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> > Co-developed-by: Kent Overstreet <kent.overstreet@linux.dev>
> > Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
> > ---
> >  Documentation/admin-guide/sysctl/vm.rst |  16 +++
> >  Documentation/filesystems/proc.rst      |  28 +++++
> >  include/asm-generic/codetag.lds.h       |  14 +++
> >  include/asm-generic/vmlinux.lds.h       |   3 +
> >  include/linux/alloc_tag.h               | 133 ++++++++++++++++++++
> >  include/linux/sched.h                   |  24 ++++
> >  lib/Kconfig.debug                       |  25 ++++
> >  lib/Makefile                            |   2 +
> >  lib/alloc_tag.c                         | 158 ++++++++++++++++++++++++
> >  scripts/module.lds.S                    |   7 ++
> >  10 files changed, 410 insertions(+)
> >  create mode 100644 include/asm-generic/codetag.lds.h
> >  create mode 100644 include/linux/alloc_tag.h
> >  create mode 100644 lib/alloc_tag.c
> >
> > diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
> > index c59889de122b..a214719492ea 100644
> > --- a/Documentation/admin-guide/sysctl/vm.rst
> > +++ b/Documentation/admin-guide/sysctl/vm.rst
> > @@ -43,6 +43,7 @@ Currently, these files are in /proc/sys/vm:
> >  - legacy_va_layout
> >  - lowmem_reserve_ratio
> >  - max_map_count
> > +- mem_profiling         (only if CONFIG_MEM_ALLOC_PROFILING=y)
> >  - memory_failure_early_kill
> >  - memory_failure_recovery
> >  - min_free_kbytes
> > @@ -425,6 +426,21 @@ e.g., up to one or two maps per allocation.
> >  The default value is 65530.
> >
> >
> > +mem_profiling
> > +==============
> > +
> > +Enable memory profiling (when CONFIG_MEM_ALLOC_PROFILING=y)
> > +
> > +1: Enable memory profiling.
> > +
> > +0: Disabld memory profiling.
>
>       Disable

Ack.

>
> ...
>
> > +allocinfo
> > +~~~~~~~
> > +
> > +Provides information about memory allocations at all locations in the code
> > +base. Each allocation in the code is identified by its source file, line
> > +number, module and the function calling the allocation. The number of bytes
> > +allocated at each location is reported.
>
> See, it even says "number of bytes" :)

Yes, we are changing the output to bytes.

>
> > +
> > +Example output.
> > +
> > +::
> > +
> > +    > cat /proc/allocinfo
> > +
> > +      153MiB     mm/slub.c:1826 module:slub func:alloc_slab_page
>
> Is "module" meant in the usual kernel module sense? In that case IIRC is
> more common to annotate things e.g. [xfs] in case it's really a module, and
> nothing if it's built it, such as slub. Is that "slub" simply derived from
> "mm/slub.c"? Then it's just redundant?

Sounds good. The new example would look like this:

    > sort -rn /proc/allocinfo
   127664128    31168 mm/page_ext.c:270 func:alloc_page_ext
    56373248     4737 mm/slub.c:2259 func:alloc_slab_page
    14880768     3633 mm/readahead.c:247 func:page_cache_ra_unbounded
    14417920     3520 mm/mm_init.c:2530 func:alloc_large_system_hash
    13377536      234 block/blk-mq.c:3421 func:blk_mq_alloc_rqs
    11718656     2861 mm/filemap.c:1919 func:__filemap_get_folio
     9192960     2800 kernel/fork.c:307 func:alloc_thread_stack_node
     4206592        4 net/netfilter/nf_conntrack_core.c:2567
func:nf_ct_alloc_hashtable
     4136960     1010 drivers/staging/ctagmod/ctagmod.c:20 [ctagmod]
func:ctagmod_start
     3940352      962 mm/memory.c:4214 func:alloc_anon_folio
     2894464    22613 fs/kernfs/dir.c:615 func:__kernfs_new_node
     ...

Note that [ctagmod] is the only allocation from a module in this example.

>
> > +     6.08MiB     mm/slab_common.c:950 module:slab_common func:_kmalloc_order
> > +     5.09MiB     mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts
> > +     4.54MiB     mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact
> > +     1.32MiB     include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one
> > +     1.16MiB     fs/xfs/xfs_log_priv.h:700 module:xfs func:xlog_kvmalloc
> > +     1.00MiB     mm/swap_cgroup.c:48 module:swap_cgroup func:swap_cgroup_prepare
> > +      734KiB     fs/xfs/kmem.c:20 module:xfs func:kmem_alloc
> > +      640KiB     kernel/rcu/tree.c:3184 module:tree func:fill_page_cache_func
> > +      640KiB     drivers/char/virtio_console.c:452 module:virtio_console func:alloc_buf
> > +      ...
> > +
> > +
> >  meminfo
>
> ...
>
> > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> > index 0be2d00c3696..78d258ca508f 100644
> > --- a/lib/Kconfig.debug
> > +++ b/lib/Kconfig.debug
> > @@ -972,6 +972,31 @@ config CODE_TAGGING
> >       bool
> >       select KALLSYMS
> >
> > +config MEM_ALLOC_PROFILING
> > +     bool "Enable memory allocation profiling"
> > +     default n
> > +     depends on PROC_FS
> > +     depends on !DEBUG_FORCE_WEAK_PER_CPU
> > +     select CODE_TAGGING
> > +     help
> > +       Track allocation source code and record total allocation size
> > +       initiated at that code location. The mechanism can be used to track
> > +       memory leaks with a low performance and memory impact.
> > +
> > +config MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
> > +     bool "Enable memory allocation profiling by default"
> > +     default y
>
> I'd go with default n as that I'd select for a general distro.

Well, we have MEM_ALLOC_PROFILING=n by default, so if it was switched
on manually, that is a strong sign that the user wants it enabled IMO.
So, enabling this switch by default seems logical to me. If a distro
wants to have the feature compiled in but disabled by default then
this is perfectly doable, just need to set both options appropriately.
Does my logic make sense?

>
> > +     depends on MEM_ALLOC_PROFILING
> > +
> > +config MEM_ALLOC_PROFILING_DEBUG
> > +     bool "Memory allocation profiler debugging"
> > +     default n
> > +     depends on MEM_ALLOC_PROFILING
> > +     select MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
> > +     help
> > +       Adds warnings with helpful error messages for memory allocation
> > +       profiling.
> > +
>
diff mbox series

Patch

diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index c59889de122b..a214719492ea 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -43,6 +43,7 @@  Currently, these files are in /proc/sys/vm:
 - legacy_va_layout
 - lowmem_reserve_ratio
 - max_map_count
+- mem_profiling         (only if CONFIG_MEM_ALLOC_PROFILING=y)
 - memory_failure_early_kill
 - memory_failure_recovery
 - min_free_kbytes
@@ -425,6 +426,21 @@  e.g., up to one or two maps per allocation.
 The default value is 65530.
 
 
+mem_profiling
+==============
+
+Enable memory profiling (when CONFIG_MEM_ALLOC_PROFILING=y)
+
+1: Enable memory profiling.
+
+0: Disabld memory profiling.
+
+Enabling memory profiling introduces a small performance overhead for all
+memory allocations.
+
+The default value depends on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT.
+
+
 memory_failure_early_kill:
 ==========================
 
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 104c6d047d9b..40d6d18308e4 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -688,6 +688,7 @@  files are there, and which are missing.
  ============ ===============================================================
  File         Content
  ============ ===============================================================
+ allocinfo    Memory allocations profiling information
  apm          Advanced power management info
  bootconfig   Kernel command line obtained from boot config,
  	      and, if there were kernel parameters from the
@@ -953,6 +954,33 @@  also be allocatable although a lot of filesystem metadata may have to be
 reclaimed to achieve this.
 
 
+allocinfo
+~~~~~~~
+
+Provides information about memory allocations at all locations in the code
+base. Each allocation in the code is identified by its source file, line
+number, module and the function calling the allocation. The number of bytes
+allocated at each location is reported.
+
+Example output.
+
+::
+
+    > cat /proc/allocinfo
+
+      153MiB     mm/slub.c:1826 module:slub func:alloc_slab_page
+     6.08MiB     mm/slab_common.c:950 module:slab_common func:_kmalloc_order
+     5.09MiB     mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts
+     4.54MiB     mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact
+     1.32MiB     include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one
+     1.16MiB     fs/xfs/xfs_log_priv.h:700 module:xfs func:xlog_kvmalloc
+     1.00MiB     mm/swap_cgroup.c:48 module:swap_cgroup func:swap_cgroup_prepare
+      734KiB     fs/xfs/kmem.c:20 module:xfs func:kmem_alloc
+      640KiB     kernel/rcu/tree.c:3184 module:tree func:fill_page_cache_func
+      640KiB     drivers/char/virtio_console.c:452 module:virtio_console func:alloc_buf
+      ...
+
+
 meminfo
 ~~~~~~~
 
diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h
new file mode 100644
index 000000000000..64f536b80380
--- /dev/null
+++ b/include/asm-generic/codetag.lds.h
@@ -0,0 +1,14 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __ASM_GENERIC_CODETAG_LDS_H
+#define __ASM_GENERIC_CODETAG_LDS_H
+
+#define SECTION_WITH_BOUNDARIES(_name)	\
+	. = ALIGN(8);			\
+	__start_##_name = .;		\
+	KEEP(*(_name))			\
+	__stop_##_name = .;
+
+#define CODETAG_SECTIONS()		\
+	SECTION_WITH_BOUNDARIES(alloc_tags)
+
+#endif /* __ASM_GENERIC_CODETAG_LDS_H */
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 5dd3a61d673d..c9997dc50c50 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -50,6 +50,8 @@ 
  *               [__nosave_begin, __nosave_end] for the nosave data
  */
 
+#include <asm-generic/codetag.lds.h>
+
 #ifndef LOAD_OFFSET
 #define LOAD_OFFSET 0
 #endif
@@ -366,6 +368,7 @@ 
 	. = ALIGN(8);							\
 	BOUNDED_SECTION_BY(__dyndbg_classes, ___dyndbg_classes)		\
 	BOUNDED_SECTION_BY(__dyndbg, ___dyndbg)				\
+	CODETAG_SECTIONS()						\
 	LIKELY_PROFILE()		       				\
 	BRANCH_PROFILE()						\
 	TRACE_PRINTKS()							\
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
new file mode 100644
index 000000000000..cf55a149fa84
--- /dev/null
+++ b/include/linux/alloc_tag.h
@@ -0,0 +1,133 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * allocation tagging
+ */
+#ifndef _LINUX_ALLOC_TAG_H
+#define _LINUX_ALLOC_TAG_H
+
+#include <linux/bug.h>
+#include <linux/codetag.h>
+#include <linux/container_of.h>
+#include <linux/preempt.h>
+#include <asm/percpu.h>
+#include <linux/cpumask.h>
+#include <linux/static_key.h>
+
+struct alloc_tag_counters {
+	u64 bytes;
+	u64 calls;
+};
+
+/*
+ * An instance of this structure is created in a special ELF section at every
+ * allocation callsite. At runtime, the special section is treated as
+ * an array of these. Embedded codetag utilizes codetag framework.
+ */
+struct alloc_tag {
+	struct codetag			ct;
+	struct alloc_tag_counters __percpu	*counters;
+} __aligned(8);
+
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+
+static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct)
+{
+	return container_of(ct, struct alloc_tag, ct);
+}
+
+#ifdef ARCH_NEEDS_WEAK_PER_CPU
+/*
+ * When percpu variables are required to be defined as weak, static percpu
+ * variables can't be used inside a function (see comments for DECLARE_PER_CPU_SECTION).
+ */
+#error "Memory allocation profiling is incompatible with ARCH_NEEDS_WEAK_PER_CPU"
+#endif
+
+#define DEFINE_ALLOC_TAG(_alloc_tag, _old)					\
+	static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr);	\
+	static struct alloc_tag _alloc_tag __used __aligned(8)			\
+	__section("alloc_tags") = {						\
+		.ct = CODE_TAG_INIT,						\
+		.counters = &_alloc_tag_cntr };					\
+	struct alloc_tag * __maybe_unused _old = alloc_tag_save(&_alloc_tag)
+
+DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
+			mem_alloc_profiling_key);
+
+static inline bool mem_alloc_profiling_enabled(void)
+{
+	return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
+				   &mem_alloc_profiling_key);
+}
+
+static inline struct alloc_tag_counters alloc_tag_read(struct alloc_tag *tag)
+{
+	struct alloc_tag_counters v = { 0, 0 };
+	struct alloc_tag_counters *counter;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		counter = per_cpu_ptr(tag->counters, cpu);
+		v.bytes += counter->bytes;
+		v.calls += counter->calls;
+	}
+
+	return v;
+}
+
+static inline void __alloc_tag_sub(union codetag_ref *ref, size_t bytes)
+{
+	struct alloc_tag *tag;
+
+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
+	WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n");
+#endif
+	if (!ref || !ref->ct)
+		return;
+
+	tag = ct_to_alloc_tag(ref->ct);
+
+	this_cpu_sub(tag->counters->bytes, bytes);
+	this_cpu_dec(tag->counters->calls);
+
+	ref->ct = NULL;
+}
+
+static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes)
+{
+	__alloc_tag_sub(ref, bytes);
+}
+
+static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes)
+{
+	__alloc_tag_sub(ref, bytes);
+}
+
+static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes)
+{
+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
+	WARN_ONCE(ref && ref->ct,
+		  "alloc_tag was not cleared (got tag for %s:%u)\n",\
+		  ref->ct->filename, ref->ct->lineno);
+
+	WARN_ONCE(!tag, "current->alloc_tag not set");
+#endif
+	if (!ref || !tag)
+		return;
+
+	ref->ct = &tag->ct;
+	this_cpu_add(tag->counters->bytes, bytes);
+	this_cpu_inc(tag->counters->calls);
+}
+
+#else
+
+#define DEFINE_ALLOC_TAG(_alloc_tag, _old)
+static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {}
+static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) {}
+static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag,
+				 size_t bytes) {}
+
+#endif
+
+#endif /* _LINUX_ALLOC_TAG_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ffe8f618ab86..da68a10517c8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -770,6 +770,10 @@  struct task_struct {
 	unsigned int			flags;
 	unsigned int			ptrace;
 
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+	struct alloc_tag		*alloc_tag;
+#endif
+
 #ifdef CONFIG_SMP
 	int				on_cpu;
 	struct __call_single_node	wake_entry;
@@ -810,6 +814,7 @@  struct task_struct {
 	struct task_group		*sched_task_group;
 #endif
 
+
 #ifdef CONFIG_UCLAMP_TASK
 	/*
 	 * Clamp values requested for a scheduling entity.
@@ -2183,4 +2188,23 @@  static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); }
 
 extern void sched_set_stop_task(int cpu, struct task_struct *stop);
 
+#ifdef CONFIG_MEM_ALLOC_PROFILING
+static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag)
+{
+	swap(current->alloc_tag, tag);
+	return tag;
+}
+
+static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old)
+{
+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
+	WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n");
+#endif
+	current->alloc_tag = old;
+}
+#else
+static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { return NULL; }
+#define alloc_tag_restore(_tag, _old)
+#endif
+
 #endif
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 0be2d00c3696..78d258ca508f 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -972,6 +972,31 @@  config CODE_TAGGING
 	bool
 	select KALLSYMS
 
+config MEM_ALLOC_PROFILING
+	bool "Enable memory allocation profiling"
+	default n
+	depends on PROC_FS
+	depends on !DEBUG_FORCE_WEAK_PER_CPU
+	select CODE_TAGGING
+	help
+	  Track allocation source code and record total allocation size
+	  initiated at that code location. The mechanism can be used to track
+	  memory leaks with a low performance and memory impact.
+
+config MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
+	bool "Enable memory allocation profiling by default"
+	default y
+	depends on MEM_ALLOC_PROFILING
+
+config MEM_ALLOC_PROFILING_DEBUG
+	bool "Memory allocation profiler debugging"
+	default n
+	depends on MEM_ALLOC_PROFILING
+	select MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT
+	help
+	  Adds warnings with helpful error messages for memory allocation
+	  profiling.
+
 source "lib/Kconfig.kasan"
 source "lib/Kconfig.kfence"
 source "lib/Kconfig.kmsan"
diff --git a/lib/Makefile b/lib/Makefile
index 6b48b22fdfac..859112f09bf5 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -236,6 +236,8 @@  obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 
 obj-$(CONFIG_CODE_TAGGING) += codetag.o
+obj-$(CONFIG_MEM_ALLOC_PROFILING) += alloc_tag.o
+
 lib-$(CONFIG_GENERIC_BUG) += bug.o
 
 obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
new file mode 100644
index 000000000000..4fc031f9cefd
--- /dev/null
+++ b/lib/alloc_tag.c
@@ -0,0 +1,158 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/alloc_tag.h>
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_buf.h>
+#include <linux/seq_file.h>
+
+static struct codetag_type *alloc_tag_cttype;
+
+DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT,
+			mem_alloc_profiling_key);
+
+static void *allocinfo_start(struct seq_file *m, loff_t *pos)
+{
+	struct codetag_iterator *iter;
+	struct codetag *ct;
+	loff_t node = *pos;
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	m->private = iter;
+	if (!iter)
+		return NULL;
+
+	codetag_lock_module_list(alloc_tag_cttype, true);
+	*iter = codetag_get_ct_iter(alloc_tag_cttype);
+	while ((ct = codetag_next_ct(iter)) != NULL && node)
+		node--;
+
+	return ct ? iter : NULL;
+}
+
+static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+	struct codetag_iterator *iter = (struct codetag_iterator *)arg;
+	struct codetag *ct = codetag_next_ct(iter);
+
+	(*pos)++;
+	if (!ct)
+		return NULL;
+
+	return iter;
+}
+
+static void allocinfo_stop(struct seq_file *m, void *arg)
+{
+	struct codetag_iterator *iter = (struct codetag_iterator *)m->private;
+
+	if (iter) {
+		codetag_lock_module_list(alloc_tag_cttype, false);
+		kfree(iter);
+	}
+}
+
+static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct)
+{
+	struct alloc_tag *tag = ct_to_alloc_tag(ct);
+	struct alloc_tag_counters counter = alloc_tag_read(tag);
+	s64 bytes = counter.bytes;
+	char val[10], *p = val;
+
+	if (bytes < 0) {
+		*p++ = '-';
+		bytes = -bytes;
+	}
+
+	string_get_size(bytes, 1,
+			STRING_SIZE_BASE2|STRING_SIZE_NOSPACE,
+			p, val + ARRAY_SIZE(val) - p);
+
+	seq_buf_printf(out, "%8s %8llu ", val, counter.calls);
+	codetag_to_text(out, ct);
+	seq_buf_putc(out, ' ');
+	seq_buf_putc(out, '\n');
+}
+
+static int allocinfo_show(struct seq_file *m, void *arg)
+{
+	struct codetag_iterator *iter = (struct codetag_iterator *)arg;
+	char *bufp;
+	size_t n = seq_get_buf(m, &bufp);
+	struct seq_buf buf;
+
+	seq_buf_init(&buf, bufp, n);
+	alloc_tag_to_text(&buf, iter->ct);
+	seq_commit(m, seq_buf_used(&buf));
+	return 0;
+}
+
+static const struct seq_operations allocinfo_seq_op = {
+	.start	= allocinfo_start,
+	.next	= allocinfo_next,
+	.stop	= allocinfo_stop,
+	.show	= allocinfo_show,
+};
+
+static void __init procfs_init(void)
+{
+	proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op);
+}
+
+static bool alloc_tag_module_unload(struct codetag_type *cttype,
+				    struct codetag_module *cmod)
+{
+	struct codetag_iterator iter = codetag_get_ct_iter(cttype);
+	struct alloc_tag_counters counter;
+	bool module_unused = true;
+	struct alloc_tag *tag;
+	struct codetag *ct;
+
+	for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) {
+		if (iter.cmod != cmod)
+			continue;
+
+		tag = ct_to_alloc_tag(ct);
+		counter = alloc_tag_read(tag);
+
+		if (WARN(counter.bytes, "%s:%u module %s func:%s has %llu allocated at module unload",
+			  ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes))
+			module_unused = false;
+	}
+
+	return module_unused;
+}
+
+static struct ctl_table memory_allocation_profiling_sysctls[] = {
+	{
+		.procname	= "mem_profiling",
+		.data		= &mem_alloc_profiling_key,
+#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
+		.mode		= 0444,
+#else
+		.mode		= 0644,
+#endif
+		.proc_handler	= proc_do_static_key,
+	},
+	{ }
+};
+
+static int __init alloc_tag_init(void)
+{
+	const struct codetag_type_desc desc = {
+		.section	= "alloc_tags",
+		.tag_size	= sizeof(struct alloc_tag),
+		.module_unload	= alloc_tag_module_unload,
+	};
+
+	alloc_tag_cttype = codetag_register_type(&desc);
+	if (IS_ERR_OR_NULL(alloc_tag_cttype))
+		return PTR_ERR(alloc_tag_cttype);
+
+	register_sysctl_init("vm", memory_allocation_profiling_sysctls);
+	procfs_init();
+
+	return 0;
+}
+module_init(alloc_tag_init);
diff --git a/scripts/module.lds.S b/scripts/module.lds.S
index bf5bcf2836d8..45c67a0994f3 100644
--- a/scripts/module.lds.S
+++ b/scripts/module.lds.S
@@ -9,6 +9,8 @@ 
 #define DISCARD_EH_FRAME	*(.eh_frame)
 #endif
 
+#include <asm-generic/codetag.lds.h>
+
 SECTIONS {
 	/DISCARD/ : {
 		*(.discard)
@@ -47,12 +49,17 @@  SECTIONS {
 	.data : {
 		*(.data .data.[0-9a-zA-Z_]*)
 		*(.data..L*)
+		CODETAG_SECTIONS()
 	}
 
 	.rodata : {
 		*(.rodata .rodata.[0-9a-zA-Z_]*)
 		*(.rodata..L*)
 	}
+#else
+	.data : {
+		CODETAG_SECTIONS()
+	}
 #endif
 }