diff mbox series

[v18,02/14] mm: Introduce Data Access MONitor (DAMON)

Message ID 20200713084144.4430-3-sjpark@amazon.com (mailing list archive)
State New, archived
Headers show
Series Introduce Data Access MONitor (DAMON) | expand

Commit Message

SeongJae Park July 13, 2020, 8:41 a.m. UTC
From: SeongJae Park <sjpark@amazon.de>

DAMON is a data access monitoring framework subsystem for the Linux
kernel.  The core mechanisms of DAMON make it

 - accurate (the monitoring output is useful enough for DRAM level
   memory management; It might not appropriate for CPU Cache levels,
   though),
 - light-weight (the monitoring overhead is low enough to be applied
   online), and
 - scalable (the upper-bound of the overhead is in constant range
   regardless of the size of target workloads).

Using this framework, therefore, the kernel's memory management
mechanisms can make advanced decisions.  Experimental memory management
optimization works that incurring high data accesses monitoring overhead
could implemented again.  In user space, meanwhile, users who have some
special workloads can write personalized applications for better
understanding and optimizations of their workloads and systems.

This commit is implementing only the stub for the module load/unload,
basic data structures, and simple manipulation functions of the
structures to keep the size of commit small.  The core mechanisms of
DAMON will be implemented one by one by following commits.

Signed-off-by: SeongJae Park <sjpark@amazon.de>
Reviewed-by: Leonard Foerster <foersleo@amazon.de>
Reviewed-by: Varad Gautam <vrd@amazon.de>
---
 include/linux/damon.h |  63 ++++++++++++++
 mm/Kconfig            |  12 +++
 mm/Makefile           |   1 +
 mm/damon.c            | 188 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 264 insertions(+)
 create mode 100644 include/linux/damon.h
 create mode 100644 mm/damon.c

Comments

Shakeel Butt July 18, 2020, 2:47 a.m. UTC | #1
On Mon, Jul 13, 2020 at 1:43 AM SeongJae Park <sjpark@amazon.com> wrote:
>
> From: SeongJae Park <sjpark@amazon.de>
>
> DAMON is a data access monitoring framework subsystem for the Linux
> kernel.  The core mechanisms of DAMON make it
>
>  - accurate (the monitoring output is useful enough for DRAM level
>    memory management; It might not appropriate for CPU Cache levels,
>    though),
>  - light-weight (the monitoring overhead is low enough to be applied
>    online), and
>  - scalable (the upper-bound of the overhead is in constant range
>    regardless of the size of target workloads).
>
> Using this framework, therefore, the kernel's memory management
> mechanisms can make advanced decisions.  Experimental memory management
> optimization works that incurring high data accesses monitoring overhead
> could implemented again.  In user space, meanwhile, users who have some
> special workloads can write personalized applications for better
> understanding and optimizations of their workloads and systems.
>
> This commit is implementing only the stub for the module load/unload,
> basic data structures, and simple manipulation functions of the
> structures to keep the size of commit small.  The core mechanisms of
> DAMON will be implemented one by one by following commits.
>
> Signed-off-by: SeongJae Park <sjpark@amazon.de>
> Reviewed-by: Leonard Foerster <foersleo@amazon.de>
> Reviewed-by: Varad Gautam <vrd@amazon.de>
> ---
>  include/linux/damon.h |  63 ++++++++++++++
>  mm/Kconfig            |  12 +++
>  mm/Makefile           |   1 +
>  mm/damon.c            | 188 ++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 264 insertions(+)
>  create mode 100644 include/linux/damon.h
>  create mode 100644 mm/damon.c
>
> diff --git a/include/linux/damon.h b/include/linux/damon.h
> new file mode 100644
> index 000000000000..c8f8c1c41a45
> --- /dev/null
> +++ b/include/linux/damon.h
> @@ -0,0 +1,63 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * DAMON api
> + *
> + * Copyright 2019-2020 Amazon.com, Inc. or its affiliates.
> + *
> + * Author: SeongJae Park <sjpark@amazon.de>
> + */
> +
> +#ifndef _DAMON_H_
> +#define _DAMON_H_
> +
> +#include <linux/random.h>
> +#include <linux/types.h>
> +
> +/**
> + * struct damon_addr_range - Represents an address region of [@start, @end).
> + * @start:     Start address of the region (inclusive).
> + * @end:       End address of the region (exclusive).
> + */
> +struct damon_addr_range {
> +       unsigned long start;
> +       unsigned long end;
> +};
> +
> +/**
> + * struct damon_region - Represents a monitoring target region.
> + * @ar:                        The address range of the region.
> + * @sampling_addr:     Address of the sample for the next access check.
> + * @nr_accesses:       Access frequency of this region.
> + * @list:              List head for siblings.
> + */
> +struct damon_region {
> +       struct damon_addr_range ar;
> +       unsigned long sampling_addr;
> +       unsigned int nr_accesses;
> +       struct list_head list;
> +};
> +
> +/**
> + * struct damon_task - Represents a monitoring target task.
> + * @pid:               Process id of the task.
> + * @regions_list:      Head of the monitoring target regions of this task.
> + * @list:              List head for siblings.
> + *
> + * If the monitoring target address space is task independent (e.g., physical
> + * memory address space monitoring), @pid should be '-1'.
> + */
> +struct damon_task {
> +       int pid;

Storing and accessing pid like this is racy. Why not save the "struct
pid" after getting the reference? I am still going over the usage,
maybe storing mm_struct would be an even better choice.

> +       struct list_head regions_list;
> +       struct list_head list;
> +};
> +
> +/**
> + * struct damon_ctx - Represents a context for each monitoring.
> + * @tasks_list:                Head of monitoring target tasks (&damon_task) list.
> + */
> +struct damon_ctx {
> +       struct list_head tasks_list;    /* 'damon_task' objects */
> +};
> +
> +#endif
> diff --git a/mm/Kconfig b/mm/Kconfig
> index c1acc34c1c35..464e9594dcec 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -867,4 +867,16 @@ config ARCH_HAS_HUGEPD
>  config MAPPING_DIRTY_HELPERS
>          bool
>
> +config DAMON
> +       tristate "Data Access Monitor"
> +       depends on MMU
> +       help
> +         This feature allows to monitor access frequency of each memory
> +         region. The information can be useful for performance-centric DRAM
> +         level memory management.
> +
> +         See https://damonitor.github.io/doc/html/latest-damon/index.html for
> +         more information.
> +         If unsure, say N.
> +
>  endmenu
> diff --git a/mm/Makefile b/mm/Makefile
> index fccd3756b25f..230e545b6e07 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -112,3 +112,4 @@ obj-$(CONFIG_MEMFD_CREATE) += memfd.o
>  obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
>  obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
>  obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
> +obj-$(CONFIG_DAMON) += damon.o
> diff --git a/mm/damon.c b/mm/damon.c
> new file mode 100644
> index 000000000000..5ab13b1c15cf
> --- /dev/null
> +++ b/mm/damon.c
> @@ -0,0 +1,188 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Data Access Monitor
> + *
> + * Copyright 2019-2020 Amazon.com, Inc. or its affiliates.
> + *
> + * Author: SeongJae Park <sjpark@amazon.de>
> + *
> + * This file is constructed in below parts.
> + *
> + * - Functions and macros for DAMON data structures
> + * - Functions for the module loading/unloading
> + *
> + * The core parts are not implemented yet.
> + */
> +
> +#define pr_fmt(fmt) "damon: " fmt
> +
> +#include <linux/damon.h>
> +#include <linux/mm.h>
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +
> +/*
> + * Functions and macros for DAMON data structures
> + */
> +
> +#define damon_get_task_struct(t) \
> +       (get_pid_task(find_vpid(t->pid), PIDTYPE_PID))

You need at least rcu lock around find_vpid(). Also you need to be
careful about the context. If you accept my previous suggestion then
you just need to do this in the process context which is registering
the pid (no need to worry about the pid namespace).

I am wondering if there should be an interface to register processes
with DAMON using pidfd instead of integer pid.

> +
> +#define damon_next_region(r) \
> +       (container_of(r->list.next, struct damon_region, list))
> +
> +#define damon_prev_region(r) \
> +       (container_of(r->list.prev, struct damon_region, list))
> +
> +#define damon_for_each_region(r, t) \
> +       list_for_each_entry(r, &t->regions_list, list)
> +
> +#define damon_for_each_region_safe(r, next, t) \
> +       list_for_each_entry_safe(r, next, &t->regions_list, list)
> +
> +#define damon_for_each_task(t, ctx) \
> +       list_for_each_entry(t, &(ctx)->tasks_list, list)
> +
> +#define damon_for_each_task_safe(t, next, ctx) \
> +       list_for_each_entry_safe(t, next, &(ctx)->tasks_list, list)
> +
> +/* Get a random number in [l, r) */
> +#define damon_rand(l, r) (l + prandom_u32() % (r - l))
> +
> +/*
> + * Construct a damon_region struct
> + *
> + * Returns the pointer to the new struct if success, or NULL otherwise
> + */
> +static struct damon_region *damon_new_region(unsigned long start,
> +                                            unsigned long end)
> +{
> +       struct damon_region *region;
> +
> +       region = kmalloc(sizeof(*region), GFP_KERNEL);
> +       if (!region)
> +               return NULL;
> +
> +       region->ar.start = start;
> +       region->ar.end = end;
> +       region->nr_accesses = 0;
> +       INIT_LIST_HEAD(&region->list);
> +
> +       return region;
> +}
> +
> +/*
> + * Add a region between two other regions
> + */
> +static inline void damon_insert_region(struct damon_region *r,
> +               struct damon_region *prev, struct damon_region *next)
> +{
> +       __list_add(&r->list, &prev->list, &next->list);
> +}
> +
> +static void damon_add_region(struct damon_region *r, struct damon_task *t)
> +{
> +       list_add_tail(&r->list, &t->regions_list);
> +}
> +
> +static void damon_del_region(struct damon_region *r)
> +{
> +       list_del(&r->list);
> +}
> +
> +static void damon_free_region(struct damon_region *r)
> +{
> +       kfree(r);
> +}
> +
> +static void damon_destroy_region(struct damon_region *r)
> +{
> +       damon_del_region(r);
> +       damon_free_region(r);
> +}
> +
> +/*
> + * Construct a damon_task struct
> + *
> + * Returns the pointer to the new struct if success, or NULL otherwise
> + */
> +static struct damon_task *damon_new_task(int pid)
> +{
> +       struct damon_task *t;
> +
> +       t = kmalloc(sizeof(*t), GFP_KERNEL);
> +       if (!t)
> +               return NULL;
> +
> +       t->pid = pid;
> +       INIT_LIST_HEAD(&t->regions_list);
> +
> +       return t;
> +}
> +
> +static void damon_add_task(struct damon_ctx *ctx, struct damon_task *t)
> +{
> +       list_add_tail(&t->list, &ctx->tasks_list);
> +}
> +
> +static void damon_del_task(struct damon_task *t)
> +{
> +       list_del(&t->list);
> +}
> +
> +static void damon_free_task(struct damon_task *t)
> +{
> +       struct damon_region *r, *next;
> +
> +       damon_for_each_region_safe(r, next, t)
> +               damon_free_region(r);
> +       kfree(t);
> +}
> +
> +static void damon_destroy_task(struct damon_task *t)
> +{
> +       damon_del_task(t);
> +       damon_free_task(t);
> +}
> +
> +static unsigned int nr_damon_tasks(struct damon_ctx *ctx)
> +{
> +       struct damon_task *t;
> +       unsigned int nr_tasks = 0;
> +
> +       damon_for_each_task(t, ctx)
> +               nr_tasks++;
> +
> +       return nr_tasks;
> +}
> +
> +static unsigned int nr_damon_regions(struct damon_task *t)
> +{
> +       struct damon_region *r;
> +       unsigned int nr_regions = 0;
> +
> +       damon_for_each_region(r, t)
> +               nr_regions++;
> +
> +       return nr_regions;
> +}
> +
> +/*
> + * Functions for the module loading/unloading
> + */
> +
> +static int __init damon_init(void)
> +{
> +       return 0;
> +}
> +
> +static void __exit damon_exit(void)
> +{
> +}
> +
> +module_init(damon_init);
> +module_exit(damon_exit);
> +
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("SeongJae Park <sjpark@amazon.de>");
> +MODULE_DESCRIPTION("DAMON: Data Access MONitor");
> --
> 2.17.1
>
Shakeel Butt July 29, 2020, 3:31 p.m. UTC | #2
On Sat, Jul 18, 2020 at 6:31 AM SeongJae Park <sjpark@amazon.com> wrote:
>
> On Fri, 17 Jul 2020 19:47:50 -0700 Shakeel Butt <shakeelb@google.com> wrote:
>
> > On Mon, Jul 13, 2020 at 1:43 AM SeongJae Park <sjpark@amazon.com> wrote:
> > >
> > > From: SeongJae Park <sjpark@amazon.de>
> > >
> > > DAMON is a data access monitoring framework subsystem for the Linux
> > > kernel.  The core mechanisms of DAMON make it
> > >
> > >  - accurate (the monitoring output is useful enough for DRAM level
> > >    memory management; It might not appropriate for CPU Cache levels,
> > >    though),
> > >  - light-weight (the monitoring overhead is low enough to be applied
> > >    online), and
> > >  - scalable (the upper-bound of the overhead is in constant range
> > >    regardless of the size of target workloads).
> > >
> > > Using this framework, therefore, the kernel's memory management
> > > mechanisms can make advanced decisions.  Experimental memory management
> > > optimization works that incurring high data accesses monitoring overhead
> > > could implemented again.  In user space, meanwhile, users who have some
> > > special workloads can write personalized applications for better
> > > understanding and optimizations of their workloads and systems.
> > >
> > > This commit is implementing only the stub for the module load/unload,
> > > basic data structures, and simple manipulation functions of the
> > > structures to keep the size of commit small.  The core mechanisms of
> > > DAMON will be implemented one by one by following commits.
> > >
> > > Signed-off-by: SeongJae Park <sjpark@amazon.de>
> > > Reviewed-by: Leonard Foerster <foersleo@amazon.de>
> > > Reviewed-by: Varad Gautam <vrd@amazon.de>
> > > ---
> > >  include/linux/damon.h |  63 ++++++++++++++
> > >  mm/Kconfig            |  12 +++
> > >  mm/Makefile           |   1 +
> > >  mm/damon.c            | 188 ++++++++++++++++++++++++++++++++++++++++++
> > >  4 files changed, 264 insertions(+)
> > >  create mode 100644 include/linux/damon.h
> > >  create mode 100644 mm/damon.c
> > >
> > > diff --git a/include/linux/damon.h b/include/linux/damon.h
> > > new file mode 100644
> > > index 000000000000..c8f8c1c41a45
> > > --- /dev/null
> > > +++ b/include/linux/damon.h
> > > @@ -0,0 +1,63 @@
> > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > +/*
> > > + * DAMON api
> > > + *
> > > + * Copyright 2019-2020 Amazon.com, Inc. or its affiliates.
> > > + *
> > > + * Author: SeongJae Park <sjpark@amazon.de>
> > > + */
> > > +
> [...]
> > > +
> > > +/**
> > > + * struct damon_task - Represents a monitoring target task.
> > > + * @pid:               Process id of the task.
> > > + * @regions_list:      Head of the monitoring target regions of this task.
> > > + * @list:              List head for siblings.
> > > + *
> > > + * If the monitoring target address space is task independent (e.g., physical
> > > + * memory address space monitoring), @pid should be '-1'.
> > > + */
> > > +struct damon_task {
> > > +       int pid;
> >
> > Storing and accessing pid like this is racy. Why not save the "struct
> > pid" after getting the reference? I am still going over the usage,
> > maybe storing mm_struct would be an even better choice.
> >
> > > +       struct list_head regions_list;
> > > +       struct list_head list;
> > > +};
> > > +
> [...]
> > > +
> > > +#define damon_get_task_struct(t) \
> > > +       (get_pid_task(find_vpid(t->pid), PIDTYPE_PID))
> >
> > You need at least rcu lock around find_vpid(). Also you need to be
> > careful about the context. If you accept my previous suggestion then
> > you just need to do this in the process context which is registering
> > the pid (no need to worry about the pid namespace).
> >
> > I am wondering if there should be an interface to register processes
> > with DAMON using pidfd instead of integer pid.
>
> Good points!  I will use pidfd for this purpose, instead.
>
> BTW, 'struct damon_task' was introduced while DAMON supports only virtual
> address spaces and recently extended to support physical memory address
> monitoring case by defining an exceptional pid (-1) for such case.  I think it
> doesn't smoothly fit with the design.
>
> Therefore, I would like to change it with more general named struct, e.g.,
>
>     struct damon_target {
>             void *id;
>             struct list_head regions_list;
>             struct list_head list;
>     };
>
> The 'id' field will be able to store or point pid_t, struct mm_struct, struct
> pid, or anything relevant, depending on the target address space.
>
> Only one part of the address space independent logics of DAMON, namely
> 'kdamon_need_stop()', uses '->pid' of the 'struct damon_task'.  It will be
> introduced by the next patch ("mm/damon: Implement region based sampling").
> Therefore, the conversion will be easy.  For the part, I could add another
> callback, e.g.,
>
>     struct damon_ctx {
>             [...]
>             bool (*is_target_valid)(struct damon_target *t);
>     };
>
> And let the address space specific primitives to implement this.
>
> Then, damon_get_task_struct() and damon_get_mm() will be introduced by the
> sixth patch ("mm/damon: Implement callbacks for the virtual memory address
> spaces") as a part of the virtual address space specific primitives
> implementation.
>
> I gonna make the change in the next spin.  If you have some opinions on this,
> please let me know.
>
>

Sorry for the late response. I think the general direction you are
taking is fine but there are still some open questions. I am trying to
reason if 'address space' is general enough abstraction for different
types of monitoring targets. It fits well for the 'processes' targets.
For the physical memory, the monitoring part of the abstraction (i.e.
damon_ctx) seems fine but I am not sure about the optimization part
(i.e. [merge|split]_regions) which raises the question that should the
merge/split functionality be part of the abstraction.

I am also very interested in the 'cgroups' as the target and I am not
sure if 'address space' is the right abstraction for the cgroups as
well. Well we can think of cgroups as a combination of tasks but
cgroup also contains unmapped pages. So, maybe it is a combination of
virtual and physical address space targets damon can monitor but I am
still not clear how to specify that in the abstractions provided by
damon. Anyways these are the questions for later and we can start
simple with just processes but I would like to not expose these
abstractions/interfaces to userspace otherwise it would be really hard
to change later.

Another topic I want to discuss is managing/charging the resource
(cpu) usage of monitoring. Yes, damon with optimization has low cpu
cost but as the number of targets increase the cpu cost will increase
which will be in a range which can not be ignored as system overhead.
At the moment, it seems like there is one kthread doing all the
monitoring, since we can control the cpu usage of kthreads, it might
make sense to allow different kthreads for different sets of targets
(processes in a cgroup).
SeongJae Park July 29, 2020, 5:29 p.m. UTC | #3
On Wed, 29 Jul 2020 08:31:29 -0700 Shakeel Butt <shakeelb@google.com> wrote:

> On Sat, Jul 18, 2020 at 6:31 AM SeongJae Park <sjpark@amazon.com> wrote:
> >
> > On Fri, 17 Jul 2020 19:47:50 -0700 Shakeel Butt <shakeelb@google.com> wrote:
> >
> > > On Mon, Jul 13, 2020 at 1:43 AM SeongJae Park <sjpark@amazon.com> wrote:
> > > >
> > > > From: SeongJae Park <sjpark@amazon.de>
> > > >
> > > > DAMON is a data access monitoring framework subsystem for the Linux
> > > > kernel.  The core mechanisms of DAMON make it
> > > >
> > > >  - accurate (the monitoring output is useful enough for DRAM level
> > > >    memory management; It might not appropriate for CPU Cache levels,
> > > >    though),
> > > >  - light-weight (the monitoring overhead is low enough to be applied
> > > >    online), and
> > > >  - scalable (the upper-bound of the overhead is in constant range
> > > >    regardless of the size of target workloads).
> > > >
> > > > Using this framework, therefore, the kernel's memory management
> > > > mechanisms can make advanced decisions.  Experimental memory management
> > > > optimization works that incurring high data accesses monitoring overhead
> > > > could implemented again.  In user space, meanwhile, users who have some
> > > > special workloads can write personalized applications for better
> > > > understanding and optimizations of their workloads and systems.
> > > >
> > > > This commit is implementing only the stub for the module load/unload,
> > > > basic data structures, and simple manipulation functions of the
> > > > structures to keep the size of commit small.  The core mechanisms of
> > > > DAMON will be implemented one by one by following commits.
> > > >
> > > > Signed-off-by: SeongJae Park <sjpark@amazon.de>
> > > > Reviewed-by: Leonard Foerster <foersleo@amazon.de>
> > > > Reviewed-by: Varad Gautam <vrd@amazon.de>
> > > > ---
> > > >  include/linux/damon.h |  63 ++++++++++++++
> > > >  mm/Kconfig            |  12 +++
> > > >  mm/Makefile           |   1 +
> > > >  mm/damon.c            | 188 ++++++++++++++++++++++++++++++++++++++++++
> > > >  4 files changed, 264 insertions(+)
> > > >  create mode 100644 include/linux/damon.h
> > > >  create mode 100644 mm/damon.c
> > > >
> > > > diff --git a/include/linux/damon.h b/include/linux/damon.h
> > > > new file mode 100644
> > > > index 000000000000..c8f8c1c41a45
> > > > --- /dev/null
> > > > +++ b/include/linux/damon.h
> > > > @@ -0,0 +1,63 @@
> > > > +/* SPDX-License-Identifier: GPL-2.0 */
> > > > +/*
> > > > + * DAMON api
> > > > + *
> > > > + * Copyright 2019-2020 Amazon.com, Inc. or its affiliates.
> > > > + *
> > > > + * Author: SeongJae Park <sjpark@amazon.de>
> > > > + */
> > > > +
> > [...]
> > > > +
> > > > +/**
> > > > + * struct damon_task - Represents a monitoring target task.
> > > > + * @pid:               Process id of the task.
> > > > + * @regions_list:      Head of the monitoring target regions of this task.
> > > > + * @list:              List head for siblings.
> > > > + *
> > > > + * If the monitoring target address space is task independent (e.g., physical
> > > > + * memory address space monitoring), @pid should be '-1'.
> > > > + */
> > > > +struct damon_task {
> > > > +       int pid;
> > >
> > > Storing and accessing pid like this is racy. Why not save the "struct
> > > pid" after getting the reference? I am still going over the usage,
> > > maybe storing mm_struct would be an even better choice.
> > >
> > > > +       struct list_head regions_list;
> > > > +       struct list_head list;
> > > > +};
> > > > +
> > [...]
> > > > +
> > > > +#define damon_get_task_struct(t) \
> > > > +       (get_pid_task(find_vpid(t->pid), PIDTYPE_PID))
> > >
> > > You need at least rcu lock around find_vpid(). Also you need to be
> > > careful about the context. If you accept my previous suggestion then
> > > you just need to do this in the process context which is registering
> > > the pid (no need to worry about the pid namespace).
> > >
> > > I am wondering if there should be an interface to register processes
> > > with DAMON using pidfd instead of integer pid.
> >
> > Good points!  I will use pidfd for this purpose, instead.
> >
> > BTW, 'struct damon_task' was introduced while DAMON supports only virtual
> > address spaces and recently extended to support physical memory address
> > monitoring case by defining an exceptional pid (-1) for such case.  I think it
> > doesn't smoothly fit with the design.
> >
> > Therefore, I would like to change it with more general named struct, e.g.,
> >
> >     struct damon_target {
> >             void *id;
> >             struct list_head regions_list;
> >             struct list_head list;
> >     };
> >
> > The 'id' field will be able to store or point pid_t, struct mm_struct, struct
> > pid, or anything relevant, depending on the target address space.
> >
> > Only one part of the address space independent logics of DAMON, namely
> > 'kdamon_need_stop()', uses '->pid' of the 'struct damon_task'.  It will be
> > introduced by the next patch ("mm/damon: Implement region based sampling").
> > Therefore, the conversion will be easy.  For the part, I could add another
> > callback, e.g.,
> >
> >     struct damon_ctx {
> >             [...]
> >             bool (*is_target_valid)(struct damon_target *t);
> >     };
> >
> > And let the address space specific primitives to implement this.
> >
> > Then, damon_get_task_struct() and damon_get_mm() will be introduced by the
> > sixth patch ("mm/damon: Implement callbacks for the virtual memory address
> > spaces") as a part of the virtual address space specific primitives
> > implementation.
> >
> > I gonna make the change in the next spin.  If you have some opinions on this,
> > please let me know.
> >
> >
> 
> Sorry for the late response. I think the general direction you are
> taking is fine but there are still some open questions. I am trying to
> reason if 'address space' is general enough abstraction for different
> types of monitoring targets. It fits well for the 'processes' targets.

Agreed, and that's why I'm planning to make the target as a simple
identifier[1] that independent of the real types of the targets.  It will be
interpreted by only the low level primitive implementation.  From the core
logic's perspective, it will not be interpreted at all but only required to be
unique among the targets of the monitoring context (damon_ctx).  The changed
version will be posted in the next spin.

[1] https://damonitor.github.io/doc/html/next/vm/damon/api.html#c.damon_target

> For the physical memory, the monitoring part of the abstraction (i.e.
> damon_ctx) seems fine but I am not sure about the optimization part
> (i.e. [merge|split]_regions) which raises the question that should the
> merge/split functionality be part of the abstraction.

I would like to argue that the optimization is the core part of DAMON that
makes its difference: It provides _controllable_ tradeoff between the accuracy
and the overhead while doing the best effort for the accuracy.  Even on
physical memory space, the logic works well if the memory is not highly
segmented.  Even under the significant segmentation, it could provide a sign of
compaction necessity.  I also believe compaction and DAMON can complement each
other.  Nonetheless, the optimization could be turned off by manipulating the
DAMON's attributes if needed.

> 
> I am also very interested in the 'cgroups' as the target and I am not
> sure if 'address space' is the right abstraction for the cgroups as
> well. Well we can think of cgroups as a combination of tasks but
> cgroup also contains unmapped pages. So, maybe it is a combination of
> virtual and physical address space targets damon can monitor but I am
> still not clear how to specify that in the abstractions provided by
> damon.

In this case, because DAMON provides the low level primitives for the virtual
address space and the physical address space itself, the user could simply make
two DAMON contexts, one for the virtual address parts of the processes in the
cgroup and one for physical address part of the cgroup and run those in
parallel.

Or, you could implement another low level primitives for cgroups usecase.  If
your usecase is somewhat complex and need some level of optimization, I think
this is the right way.  The low level primitive implementation should provide
methods[1] for 1) identifying the target, 2) constructing the monitoring target
address ranges, and 3) checking access of small address ranges in the target
ranges.  The implementation for cgroups could 1) use cgroup ids as the target
ids, 2) construct the target address ranges by integrating the address spaces
of the processes in the cgroup and the related physical memory regions (maybe
you could convert every address ranges to physical address), and 3) use page
table walk/rmap and PTE/PMD Accessed bit for the access checking.

[1] https://damonitor.github.io/doc/html/next/vm/damon/design.html#configurable-layers

> Anyways these are the questions for later and we can start
> simple with just processes but I would like to not expose these
> abstractions/interfaces to userspace otherwise it would be really hard
> to change later.

Agreed.  For now, DAMON provides three interfaces for the user space.  First
and the main one is the debugfs interface[1].  I chose debugfs because it's
relatively easy to change the interface.  Second one is tracepoint[2].  It only
provides monitoring time, the target id, and the monitored access frequency of
each region.  I believe this is sufficiently abstracted format.  Final one is
user space tool[3] written in python.  It is only a wrapper of the debugfs
interface.  I believe keeping the higher level interface will not be hard.


[1] 9th patch of this series: https://lore.kernel.org/linux-mm/20200713084144.4430-10-sjpark@amazon.com/
[2] 8th patch of this series: https://lore.kernel.org/linux-mm/20200713084144.4430-9-sjpark@amazon.com/
[3] 10th patch of this series: https://lore.kernel.org/linux-mm/20200713084144.4430-11-sjpark@amazon.com/

> 
> Another topic I want to discuss is managing/charging the resource
> (cpu) usage of monitoring. Yes, damon with optimization has low cpu
> cost but as the number of targets increase the cpu cost will increase
> which will be in a range which can not be ignored as system overhead.
> At the moment, it seems like there is one kthread doing all the
> monitoring, since we can control the cpu usage of kthreads, it might
> make sense to allow different kthreads for different sets of targets
> (processes in a cgroup).

Each DAMON context is proccessed by one kthread.  Therefore, running each of or
a group of targets in dedicated kthread is naturally supported.

Thanks again for the questions.  If I misunderstood your points or my answers
are insufficient to understand, please don't hesitate letting me know :)


Thanks,
SeongJae Park
diff mbox series

Patch

diff --git a/include/linux/damon.h b/include/linux/damon.h
new file mode 100644
index 000000000000..c8f8c1c41a45
--- /dev/null
+++ b/include/linux/damon.h
@@ -0,0 +1,63 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * DAMON api
+ *
+ * Copyright 2019-2020 Amazon.com, Inc. or its affiliates.
+ *
+ * Author: SeongJae Park <sjpark@amazon.de>
+ */
+
+#ifndef _DAMON_H_
+#define _DAMON_H_
+
+#include <linux/random.h>
+#include <linux/types.h>
+
+/**
+ * struct damon_addr_range - Represents an address region of [@start, @end).
+ * @start:	Start address of the region (inclusive).
+ * @end:	End address of the region (exclusive).
+ */
+struct damon_addr_range {
+	unsigned long start;
+	unsigned long end;
+};
+
+/**
+ * struct damon_region - Represents a monitoring target region.
+ * @ar:			The address range of the region.
+ * @sampling_addr:	Address of the sample for the next access check.
+ * @nr_accesses:	Access frequency of this region.
+ * @list:		List head for siblings.
+ */
+struct damon_region {
+	struct damon_addr_range ar;
+	unsigned long sampling_addr;
+	unsigned int nr_accesses;
+	struct list_head list;
+};
+
+/**
+ * struct damon_task - Represents a monitoring target task.
+ * @pid:		Process id of the task.
+ * @regions_list:	Head of the monitoring target regions of this task.
+ * @list:		List head for siblings.
+ *
+ * If the monitoring target address space is task independent (e.g., physical
+ * memory address space monitoring), @pid should be '-1'.
+ */
+struct damon_task {
+	int pid;
+	struct list_head regions_list;
+	struct list_head list;
+};
+
+/**
+ * struct damon_ctx - Represents a context for each monitoring.
+ * @tasks_list:		Head of monitoring target tasks (&damon_task) list.
+ */
+struct damon_ctx {
+	struct list_head tasks_list;	/* 'damon_task' objects */
+};
+
+#endif
diff --git a/mm/Kconfig b/mm/Kconfig
index c1acc34c1c35..464e9594dcec 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -867,4 +867,16 @@  config ARCH_HAS_HUGEPD
 config MAPPING_DIRTY_HELPERS
         bool
 
+config DAMON
+	tristate "Data Access Monitor"
+	depends on MMU
+	help
+	  This feature allows to monitor access frequency of each memory
+	  region. The information can be useful for performance-centric DRAM
+	  level memory management.
+
+	  See https://damonitor.github.io/doc/html/latest-damon/index.html for
+	  more information.
+	  If unsure, say N.
+
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index fccd3756b25f..230e545b6e07 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -112,3 +112,4 @@  obj-$(CONFIG_MEMFD_CREATE) += memfd.o
 obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
 obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
 obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
+obj-$(CONFIG_DAMON) += damon.o
diff --git a/mm/damon.c b/mm/damon.c
new file mode 100644
index 000000000000..5ab13b1c15cf
--- /dev/null
+++ b/mm/damon.c
@@ -0,0 +1,188 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Data Access Monitor
+ *
+ * Copyright 2019-2020 Amazon.com, Inc. or its affiliates.
+ *
+ * Author: SeongJae Park <sjpark@amazon.de>
+ *
+ * This file is constructed in below parts.
+ *
+ * - Functions and macros for DAMON data structures
+ * - Functions for the module loading/unloading
+ *
+ * The core parts are not implemented yet.
+ */
+
+#define pr_fmt(fmt) "damon: " fmt
+
+#include <linux/damon.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+/*
+ * Functions and macros for DAMON data structures
+ */
+
+#define damon_get_task_struct(t) \
+	(get_pid_task(find_vpid(t->pid), PIDTYPE_PID))
+
+#define damon_next_region(r) \
+	(container_of(r->list.next, struct damon_region, list))
+
+#define damon_prev_region(r) \
+	(container_of(r->list.prev, struct damon_region, list))
+
+#define damon_for_each_region(r, t) \
+	list_for_each_entry(r, &t->regions_list, list)
+
+#define damon_for_each_region_safe(r, next, t) \
+	list_for_each_entry_safe(r, next, &t->regions_list, list)
+
+#define damon_for_each_task(t, ctx) \
+	list_for_each_entry(t, &(ctx)->tasks_list, list)
+
+#define damon_for_each_task_safe(t, next, ctx) \
+	list_for_each_entry_safe(t, next, &(ctx)->tasks_list, list)
+
+/* Get a random number in [l, r) */
+#define damon_rand(l, r) (l + prandom_u32() % (r - l))
+
+/*
+ * Construct a damon_region struct
+ *
+ * Returns the pointer to the new struct if success, or NULL otherwise
+ */
+static struct damon_region *damon_new_region(unsigned long start,
+					     unsigned long end)
+{
+	struct damon_region *region;
+
+	region = kmalloc(sizeof(*region), GFP_KERNEL);
+	if (!region)
+		return NULL;
+
+	region->ar.start = start;
+	region->ar.end = end;
+	region->nr_accesses = 0;
+	INIT_LIST_HEAD(&region->list);
+
+	return region;
+}
+
+/*
+ * Add a region between two other regions
+ */
+static inline void damon_insert_region(struct damon_region *r,
+		struct damon_region *prev, struct damon_region *next)
+{
+	__list_add(&r->list, &prev->list, &next->list);
+}
+
+static void damon_add_region(struct damon_region *r, struct damon_task *t)
+{
+	list_add_tail(&r->list, &t->regions_list);
+}
+
+static void damon_del_region(struct damon_region *r)
+{
+	list_del(&r->list);
+}
+
+static void damon_free_region(struct damon_region *r)
+{
+	kfree(r);
+}
+
+static void damon_destroy_region(struct damon_region *r)
+{
+	damon_del_region(r);
+	damon_free_region(r);
+}
+
+/*
+ * Construct a damon_task struct
+ *
+ * Returns the pointer to the new struct if success, or NULL otherwise
+ */
+static struct damon_task *damon_new_task(int pid)
+{
+	struct damon_task *t;
+
+	t = kmalloc(sizeof(*t), GFP_KERNEL);
+	if (!t)
+		return NULL;
+
+	t->pid = pid;
+	INIT_LIST_HEAD(&t->regions_list);
+
+	return t;
+}
+
+static void damon_add_task(struct damon_ctx *ctx, struct damon_task *t)
+{
+	list_add_tail(&t->list, &ctx->tasks_list);
+}
+
+static void damon_del_task(struct damon_task *t)
+{
+	list_del(&t->list);
+}
+
+static void damon_free_task(struct damon_task *t)
+{
+	struct damon_region *r, *next;
+
+	damon_for_each_region_safe(r, next, t)
+		damon_free_region(r);
+	kfree(t);
+}
+
+static void damon_destroy_task(struct damon_task *t)
+{
+	damon_del_task(t);
+	damon_free_task(t);
+}
+
+static unsigned int nr_damon_tasks(struct damon_ctx *ctx)
+{
+	struct damon_task *t;
+	unsigned int nr_tasks = 0;
+
+	damon_for_each_task(t, ctx)
+		nr_tasks++;
+
+	return nr_tasks;
+}
+
+static unsigned int nr_damon_regions(struct damon_task *t)
+{
+	struct damon_region *r;
+	unsigned int nr_regions = 0;
+
+	damon_for_each_region(r, t)
+		nr_regions++;
+
+	return nr_regions;
+}
+
+/*
+ * Functions for the module loading/unloading
+ */
+
+static int __init damon_init(void)
+{
+	return 0;
+}
+
+static void __exit damon_exit(void)
+{
+}
+
+module_init(damon_init);
+module_exit(damon_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("SeongJae Park <sjpark@amazon.de>");
+MODULE_DESCRIPTION("DAMON: Data Access MONitor");