diff mbox series

mm: memcontrol: Add the missing numa stat of anon and file for cgroup v2

Message ID 20200910084258.22293-1-songmuchun@bytedance.com (mailing list archive)
State New, archived
Headers show
Series mm: memcontrol: Add the missing numa stat of anon and file for cgroup v2 | expand

Commit Message

Muchun Song Sept. 10, 2020, 8:42 a.m. UTC
In cgroup v1, we have a numa_stat interface. This is useful for
providing visibility into the NUMA locality information within a
memcg, since the pages are allowed to be allocated from any physical
node. One of the use cases is evaluating application performance by
combining this information with the application's CPU allocation.
But cgroup v2 has no such interface, so this patch adds the missing
information.

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 mm/memcontrol.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 44 insertions(+), 2 deletions(-)

Comments

Shakeel Butt Sept. 10, 2020, 4:01 p.m. UTC | #1
On Thu, Sep 10, 2020 at 1:46 AM Muchun Song <songmuchun@bytedance.com> wrote:
>
> In the cgroup v1, we have a numa_stat interface. This is useful for
> providing visibility into the numa locality information within an
> memcg since the pages are allowed to be allocated from any physical
> node. One of the use cases is evaluating application performance by
> combining this information with the application's CPU allocation.
> But the cgroup v2 does not. So this patch adds the missing information.
>
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> ---

I am actually working on exposing this info on v2 as well.

>  mm/memcontrol.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 44 insertions(+), 2 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 75cd1a1e66c8..c779673f29b2 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1492,10 +1492,34 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
>         return false;
>  }
>
> +#ifdef CONFIG_NUMA
> +static unsigned long memcg_node_page_state(struct mem_cgroup *memcg,
> +                                          unsigned int nid,
> +                                          enum node_stat_item idx)
> +{
> +       long x;
> +       struct mem_cgroup_per_node *pn;
> +       struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
> +
> +       VM_BUG_ON(nid >= nr_node_ids);
> +
> +       pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> +       x = atomic_long_read(&pn->lruvec_stat[idx]);
> +#ifdef CONFIG_SMP
> +       if (x < 0)
> +               x = 0;
> +#endif
> +       return x;
> +}
> +#endif
> +
>  static char *memory_stat_format(struct mem_cgroup *memcg)
>  {
>         struct seq_buf s;
>         int i;
> +#ifdef CONFIG_NUMA
> +       int nid;
> +#endif
>
>         seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
>         if (!s.buffer)
> @@ -1512,12 +1536,30 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
>          * Current memory state:
>          */
>

Let's not break the parsers of memory.stat. I would prefer a separate
interface like v1 i.e. memory.numa_stat.

> -       seq_buf_printf(&s, "anon %llu\n",
> +       seq_buf_printf(&s, "anon %llu",
>                        (u64)memcg_page_state(memcg, NR_ANON_MAPPED) *
>                        PAGE_SIZE);
> -       seq_buf_printf(&s, "file %llu\n",
> +#ifdef CONFIG_NUMA
> +       for_each_node_state(nid, N_MEMORY)
> +               seq_buf_printf(&s, " N%d=%llu", nid,
> +                              (u64)memcg_node_page_state(memcg, nid,
> +                                                         NR_ANON_MAPPED) *
> +                              PAGE_SIZE);
> +#endif
> +       seq_buf_putc(&s, '\n');
> +
> +       seq_buf_printf(&s, "file %llu",
>                        (u64)memcg_page_state(memcg, NR_FILE_PAGES) *
>                        PAGE_SIZE);
> +#ifdef CONFIG_NUMA
> +       for_each_node_state(nid, N_MEMORY)
> +               seq_buf_printf(&s, " N%d=%llu", nid,
> +                              (u64)memcg_node_page_state(memcg, nid,
> +                                                         NR_FILE_PAGES) *
> +                              PAGE_SIZE);
> +#endif
> +       seq_buf_putc(&s, '\n');
> +

The v1's numa_stat exposes the LRUs, why NR_ANON_MAPPED and NR_FILE_PAGES?

Also I think exposing slab_[un]reclaimable per node would be beneficial as well.

>         seq_buf_printf(&s, "kernel_stack %llu\n",
>                        (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *
>                        1024);
> --
> 2.20.1
>
Muchun Song Sept. 11, 2020, 3:51 a.m. UTC | #2
On Fri, Sep 11, 2020 at 12:02 AM Shakeel Butt <shakeelb@google.com> wrote:
>
> On Thu, Sep 10, 2020 at 1:46 AM Muchun Song <songmuchun@bytedance.com> wrote:
> >
> > In the cgroup v1, we have a numa_stat interface. This is useful for
> > providing visibility into the numa locality information within an
> > memcg since the pages are allowed to be allocated from any physical
> > node. One of the use cases is evaluating application performance by
> > combining this information with the application's CPU allocation.
> > But the cgroup v2 does not. So this patch adds the missing information.
> >
> > Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> > ---
>
> I am actually working on exposing this info on v2 as well.
>
> >  mm/memcontrol.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
> >  1 file changed, 44 insertions(+), 2 deletions(-)
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index 75cd1a1e66c8..c779673f29b2 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -1492,10 +1492,34 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
> >         return false;
> >  }
> >
> > +#ifdef CONFIG_NUMA
> > +static unsigned long memcg_node_page_state(struct mem_cgroup *memcg,
> > +                                          unsigned int nid,
> > +                                          enum node_stat_item idx)
> > +{
> > +       long x;
> > +       struct mem_cgroup_per_node *pn;
> > +       struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
> > +
> > +       VM_BUG_ON(nid >= nr_node_ids);
> > +
> > +       pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> > +       x = atomic_long_read(&pn->lruvec_stat[idx]);
> > +#ifdef CONFIG_SMP
> > +       if (x < 0)
> > +               x = 0;
> > +#endif
> > +       return x;
> > +}
> > +#endif
> > +
> >  static char *memory_stat_format(struct mem_cgroup *memcg)
> >  {
> >         struct seq_buf s;
> >         int i;
> > +#ifdef CONFIG_NUMA
> > +       int nid;
> > +#endif
> >
> >         seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
> >         if (!s.buffer)
> > @@ -1512,12 +1536,30 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
> >          * Current memory state:
> >          */
> >
>
> Let's not break the parsers of memory.stat. I would prefer a separate
> interface like v1 i.e. memory.numa_stat.

It is also a good idea to expose a new interface like memory.numa_stat.

>
> > -       seq_buf_printf(&s, "anon %llu\n",
> > +       seq_buf_printf(&s, "anon %llu",
> >                        (u64)memcg_page_state(memcg, NR_ANON_MAPPED) *
> >                        PAGE_SIZE);
> > -       seq_buf_printf(&s, "file %llu\n",
> > +#ifdef CONFIG_NUMA
> > +       for_each_node_state(nid, N_MEMORY)
> > +               seq_buf_printf(&s, " N%d=%llu", nid,
> > +                              (u64)memcg_node_page_state(memcg, nid,
> > +                                                         NR_ANON_MAPPED) *
> > +                              PAGE_SIZE);
> > +#endif
> > +       seq_buf_putc(&s, '\n');
> > +
> > +       seq_buf_printf(&s, "file %llu",
> >                        (u64)memcg_page_state(memcg, NR_FILE_PAGES) *
> >                        PAGE_SIZE);
> > +#ifdef CONFIG_NUMA
> > +       for_each_node_state(nid, N_MEMORY)
> > +               seq_buf_printf(&s, " N%d=%llu", nid,
> > +                              (u64)memcg_node_page_state(memcg, nid,
> > +                                                         NR_FILE_PAGES) *
> > +                              PAGE_SIZE);
> > +#endif
> > +       seq_buf_putc(&s, '\n');
> > +
>
> The v1's numa_stat exposes the LRUs, why NR_ANON_MAPPED and NR_FILE_PAGES?

If we want to expose the anon per node, we need to add inactive anon and
active anon together. Why not use NR_ANON_MAPPED directly?

>
> Also I think exposing slab_[un]reclaimable per node would be beneficial as well.

Yeah, I agree with you. Maybe kernel_stack and percpu also should
be exposed.

>
> >         seq_buf_printf(&s, "kernel_stack %llu\n",
> >                        (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *
> >                        1024);
> > --
> > 2.20.1
> >
Shakeel Butt Sept. 11, 2020, 2:55 p.m. UTC | #3
On Thu, Sep 10, 2020 at 8:52 PM Muchun Song <songmuchun@bytedance.com> wrote:
>
> On Fri, Sep 11, 2020 at 12:02 AM Shakeel Butt <shakeelb@google.com> wrote:
> >
> > On Thu, Sep 10, 2020 at 1:46 AM Muchun Song <songmuchun@bytedance.com> wrote:
> > >
> > > In the cgroup v1, we have a numa_stat interface. This is useful for
> > > providing visibility into the numa locality information within an
> > > memcg since the pages are allowed to be allocated from any physical
> > > node. One of the use cases is evaluating application performance by
> > > combining this information with the application's CPU allocation.
> > > But the cgroup v2 does not. So this patch adds the missing information.
> > >
> > > Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> > > ---
> >
> > I am actually working on exposing this info on v2 as well.
> >
> > >  mm/memcontrol.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
> > >  1 file changed, 44 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > index 75cd1a1e66c8..c779673f29b2 100644
> > > --- a/mm/memcontrol.c
> > > +++ b/mm/memcontrol.c
> > > @@ -1492,10 +1492,34 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
> > >         return false;
> > >  }
> > >
> > > +#ifdef CONFIG_NUMA
> > > +static unsigned long memcg_node_page_state(struct mem_cgroup *memcg,
> > > +                                          unsigned int nid,
> > > +                                          enum node_stat_item idx)
> > > +{
> > > +       long x;
> > > +       struct mem_cgroup_per_node *pn;
> > > +       struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
> > > +
> > > +       VM_BUG_ON(nid >= nr_node_ids);
> > > +
> > > +       pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> > > +       x = atomic_long_read(&pn->lruvec_stat[idx]);
> > > +#ifdef CONFIG_SMP
> > > +       if (x < 0)
> > > +               x = 0;
> > > +#endif
> > > +       return x;
> > > +}
> > > +#endif
> > > +
> > >  static char *memory_stat_format(struct mem_cgroup *memcg)
> > >  {
> > >         struct seq_buf s;
> > >         int i;
> > > +#ifdef CONFIG_NUMA
> > > +       int nid;
> > > +#endif
> > >
> > >         seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
> > >         if (!s.buffer)
> > > @@ -1512,12 +1536,30 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
> > >          * Current memory state:
> > >          */
> > >
> >
> > Let's not break the parsers of memory.stat. I would prefer a separate
> > interface like v1 i.e. memory.numa_stat.
>
> It is also a good idea to expose a new interface like memory.numa_stat.
>
> >
> > > -       seq_buf_printf(&s, "anon %llu\n",
> > > +       seq_buf_printf(&s, "anon %llu",
> > >                        (u64)memcg_page_state(memcg, NR_ANON_MAPPED) *
> > >                        PAGE_SIZE);
> > > -       seq_buf_printf(&s, "file %llu\n",
> > > +#ifdef CONFIG_NUMA
> > > +       for_each_node_state(nid, N_MEMORY)
> > > +               seq_buf_printf(&s, " N%d=%llu", nid,
> > > +                              (u64)memcg_node_page_state(memcg, nid,
> > > +                                                         NR_ANON_MAPPED) *
> > > +                              PAGE_SIZE);
> > > +#endif
> > > +       seq_buf_putc(&s, '\n');
> > > +
> > > +       seq_buf_printf(&s, "file %llu",
> > >                        (u64)memcg_page_state(memcg, NR_FILE_PAGES) *
> > >                        PAGE_SIZE);
> > > +#ifdef CONFIG_NUMA
> > > +       for_each_node_state(nid, N_MEMORY)
> > > +               seq_buf_printf(&s, " N%d=%llu", nid,
> > > +                              (u64)memcg_node_page_state(memcg, nid,
> > > +                                                         NR_FILE_PAGES) *
> > > +                              PAGE_SIZE);
> > > +#endif
> > > +       seq_buf_putc(&s, '\n');
> > > +
> >
> > The v1's numa_stat exposes the LRUs, why NR_ANON_MAPPED and NR_FILE_PAGES?
>
> If we want to expose the anon per node, we need to add inactive anon and
> active anon together. Why not use NR_ANON_MAPPED directly?
>

Active anon plus inactive anon is not equal to NR_ANON_MAPPED. The
shmem related memory is on anon LRUs but not accounted in
NR_ANON_MAPPED.

Similarly file LRU can contain MADV_FREE pages which are not accounted
in NR_FILE_PAGES.

> >
> > Also I think exposing slab_[un]reclaimable per node would be beneficial as well.
>
> Yeah, I agree with you. Maybe kernel_stack and percpu also should
> be exposed.
>
> >
> > >         seq_buf_printf(&s, "kernel_stack %llu\n",
> > >                        (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *
> > >                        1024);
> > > --
> > > 2.20.1
> > >
>
>
>
> --
> Yours,
> Muchun
Muchun Song Sept. 11, 2020, 3:47 p.m. UTC | #4
On Fri, Sep 11, 2020 at 10:55 PM Shakeel Butt <shakeelb@google.com> wrote:
>
> On Thu, Sep 10, 2020 at 8:52 PM Muchun Song <songmuchun@bytedance.com> wrote:
> >
> > On Fri, Sep 11, 2020 at 12:02 AM Shakeel Butt <shakeelb@google.com> wrote:
> > >
> > > On Thu, Sep 10, 2020 at 1:46 AM Muchun Song <songmuchun@bytedance.com> wrote:
> > > >
> > > > In the cgroup v1, we have a numa_stat interface. This is useful for
> > > > providing visibility into the numa locality information within an
> > > > memcg since the pages are allowed to be allocated from any physical
> > > > node. One of the use cases is evaluating application performance by
> > > > combining this information with the application's CPU allocation.
> > > > But the cgroup v2 does not. So this patch adds the missing information.
> > > >
> > > > Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> > > > ---
> > >
> > > I am actually working on exposing this info on v2 as well.
> > >
> > > >  mm/memcontrol.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
> > > >  1 file changed, 44 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > > index 75cd1a1e66c8..c779673f29b2 100644
> > > > --- a/mm/memcontrol.c
> > > > +++ b/mm/memcontrol.c
> > > > @@ -1492,10 +1492,34 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
> > > >         return false;
> > > >  }
> > > >
> > > > +#ifdef CONFIG_NUMA
> > > > +static unsigned long memcg_node_page_state(struct mem_cgroup *memcg,
> > > > +                                          unsigned int nid,
> > > > +                                          enum node_stat_item idx)
> > > > +{
> > > > +       long x;
> > > > +       struct mem_cgroup_per_node *pn;
> > > > +       struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
> > > > +
> > > > +       VM_BUG_ON(nid >= nr_node_ids);
> > > > +
> > > > +       pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> > > > +       x = atomic_long_read(&pn->lruvec_stat[idx]);
> > > > +#ifdef CONFIG_SMP
> > > > +       if (x < 0)
> > > > +               x = 0;
> > > > +#endif
> > > > +       return x;
> > > > +}
> > > > +#endif
> > > > +
> > > >  static char *memory_stat_format(struct mem_cgroup *memcg)
> > > >  {
> > > >         struct seq_buf s;
> > > >         int i;
> > > > +#ifdef CONFIG_NUMA
> > > > +       int nid;
> > > > +#endif
> > > >
> > > >         seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
> > > >         if (!s.buffer)
> > > > @@ -1512,12 +1536,30 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
> > > >          * Current memory state:
> > > >          */
> > > >
> > >
> > > Let's not break the parsers of memory.stat. I would prefer a separate
> > > interface like v1 i.e. memory.numa_stat.
> >
> > It is also a good idea to expose a new interface like memory.numa_stat.
> >
> > >
> > > > -       seq_buf_printf(&s, "anon %llu\n",
> > > > +       seq_buf_printf(&s, "anon %llu",
> > > >                        (u64)memcg_page_state(memcg, NR_ANON_MAPPED) *
> > > >                        PAGE_SIZE);
> > > > -       seq_buf_printf(&s, "file %llu\n",
> > > > +#ifdef CONFIG_NUMA
> > > > +       for_each_node_state(nid, N_MEMORY)
> > > > +               seq_buf_printf(&s, " N%d=%llu", nid,
> > > > +                              (u64)memcg_node_page_state(memcg, nid,
> > > > +                                                         NR_ANON_MAPPED) *
> > > > +                              PAGE_SIZE);
> > > > +#endif
> > > > +       seq_buf_putc(&s, '\n');
> > > > +
> > > > +       seq_buf_printf(&s, "file %llu",
> > > >                        (u64)memcg_page_state(memcg, NR_FILE_PAGES) *
> > > >                        PAGE_SIZE);
> > > > +#ifdef CONFIG_NUMA
> > > > +       for_each_node_state(nid, N_MEMORY)
> > > > +               seq_buf_printf(&s, " N%d=%llu", nid,
> > > > +                              (u64)memcg_node_page_state(memcg, nid,
> > > > +                                                         NR_FILE_PAGES) *
> > > > +                              PAGE_SIZE);
> > > > +#endif
> > > > +       seq_buf_putc(&s, '\n');
> > > > +
> > >
> > > The v1's numa_stat exposes the LRUs, why NR_ANON_MAPPED and NR_FILE_PAGES?
> >
> > If we want to expose the anon per node, we need to add inactive anon and
> > active anon together. Why not use NR_ANON_MAPPED directly?
> >
>
> Active anon plus inactive anon is not equal to NR_ANON_MAPPED. The
> shmem related memory is on anon LRUs but not accounted in
> NR_ANON_MAPPED.
>
> Similarly file LRU can contain MADV_FREE pages which are not accounted
> in NR_FILE_PAGES.

I got it, thanks. Because the "stat" interface exposes the anon and
file information, I think that we should also expose anon and file
per node in "numa_stat" instead of the LRU statistics. Maybe it is
better to expose both sets of information.

>
> > >
> > > Also I think exposing slab_[un]reclaimable per node would be beneficial as well.
> >
> > Yeah, I agree with you. Maybe kernel_stack and percpu also should
> > be exposed.
> >
> > >
> > > >         seq_buf_printf(&s, "kernel_stack %llu\n",
> > > >                        (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *
> > > >                        1024);
> > > > --
> > > > 2.20.1
> > > >
> >
> >
> >
> > --
> > Yours,
> > Muchun
Shakeel Butt Sept. 11, 2020, 3:55 p.m. UTC | #5
On Fri, Sep 11, 2020 at 8:48 AM Muchun Song <songmuchun@bytedance.com> wrote:
>
[snip]
>
> I got it, thanks. Because the "state" interface exposes the anon and
> file information. So I think that we also should expose the anon and
> file for "numa_stat" per node instead of the lru statistics. Maybe it is
> better that we expose both of all the information.
>

Sure, go ahead and please do update the doc file as well in the next version.
Roman Gushchin Sept. 11, 2020, 9:51 p.m. UTC | #6
On Fri, Sep 11, 2020 at 11:51:42AM +0800, Muchun Song wrote:
> On Fri, Sep 11, 2020 at 12:02 AM Shakeel Butt <shakeelb@google.com> wrote:
> >
> > On Thu, Sep 10, 2020 at 1:46 AM Muchun Song <songmuchun@bytedance.com> wrote:
> > >
> > > In the cgroup v1, we have a numa_stat interface. This is useful for
> > > providing visibility into the numa locality information within an
> > > memcg since the pages are allowed to be allocated from any physical
> > > node. One of the use cases is evaluating application performance by
> > > combining this information with the application's CPU allocation.
> > > But the cgroup v2 does not. So this patch adds the missing information.
> > >
> > > Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> > > ---
> >
> > I am actually working on exposing this info on v2 as well.
> >
> > >  mm/memcontrol.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
> > >  1 file changed, 44 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > > index 75cd1a1e66c8..c779673f29b2 100644
> > > --- a/mm/memcontrol.c
> > > +++ b/mm/memcontrol.c
> > > @@ -1492,10 +1492,34 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
> > >         return false;
> > >  }
> > >
> > > +#ifdef CONFIG_NUMA
> > > +static unsigned long memcg_node_page_state(struct mem_cgroup *memcg,
> > > +                                          unsigned int nid,
> > > +                                          enum node_stat_item idx)
> > > +{
> > > +       long x;
> > > +       struct mem_cgroup_per_node *pn;
> > > +       struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
> > > +
> > > +       VM_BUG_ON(nid >= nr_node_ids);
> > > +
> > > +       pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> > > +       x = atomic_long_read(&pn->lruvec_stat[idx]);
> > > +#ifdef CONFIG_SMP
> > > +       if (x < 0)
> > > +               x = 0;
> > > +#endif
> > > +       return x;
> > > +}
> > > +#endif
> > > +
> > >  static char *memory_stat_format(struct mem_cgroup *memcg)
> > >  {
> > >         struct seq_buf s;
> > >         int i;
> > > +#ifdef CONFIG_NUMA
> > > +       int nid;
> > > +#endif
> > >
> > >         seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
> > >         if (!s.buffer)
> > > @@ -1512,12 +1536,30 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
> > >          * Current memory state:
> > >          */
> > >
> >
> > Let's not break the parsers of memory.stat. I would prefer a separate
> > interface like v1 i.e. memory.numa_stat.
> 
> It is also a good idea to expose a new interface like memory.numa_stat.
> 
> >
> > > -       seq_buf_printf(&s, "anon %llu\n",
> > > +       seq_buf_printf(&s, "anon %llu",
> > >                        (u64)memcg_page_state(memcg, NR_ANON_MAPPED) *
> > >                        PAGE_SIZE);
> > > -       seq_buf_printf(&s, "file %llu\n",
> > > +#ifdef CONFIG_NUMA
> > > +       for_each_node_state(nid, N_MEMORY)
> > > +               seq_buf_printf(&s, " N%d=%llu", nid,
> > > +                              (u64)memcg_node_page_state(memcg, nid,
> > > +                                                         NR_ANON_MAPPED) *
> > > +                              PAGE_SIZE);
> > > +#endif
> > > +       seq_buf_putc(&s, '\n');
> > > +
> > > +       seq_buf_printf(&s, "file %llu",
> > >                        (u64)memcg_page_state(memcg, NR_FILE_PAGES) *
> > >                        PAGE_SIZE);
> > > +#ifdef CONFIG_NUMA
> > > +       for_each_node_state(nid, N_MEMORY)
> > > +               seq_buf_printf(&s, " N%d=%llu", nid,
> > > +                              (u64)memcg_node_page_state(memcg, nid,
> > > +                                                         NR_FILE_PAGES) *
> > > +                              PAGE_SIZE);
> > > +#endif
> > > +       seq_buf_putc(&s, '\n');
> > > +
> >
> > The v1's numa_stat exposes the LRUs, why NR_ANON_MAPPED and NR_FILE_PAGES?
> 
> If we want to expose the anon per node, we need to add inactive anon and
> active anon together. Why not use NR_ANON_MAPPED directly?
> 
> >
> > Also I think exposing slab_[un]reclaimable per node would be beneficial as well.
> 
> Yeah, I agree with you. Maybe kernel_stack and percpu also should
> be exposed.

Percpu allocations are usually spread over multiple pages and NUMA nodes,
so there are no per-node percpu counters.

Thanks!

> 
> >
> > >         seq_buf_printf(&s, "kernel_stack %llu\n",
> > >                        (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *
> > >                        1024);
> > > --
> > > 2.20.1
> > >
> 
> 
> 
> -- 
> Yours,
> Muchun
diff mbox series

Patch

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 75cd1a1e66c8..c779673f29b2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1492,10 +1492,34 @@  static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
 	return false;
 }
 
+#ifdef CONFIG_NUMA
+static unsigned long memcg_node_page_state(struct mem_cgroup *memcg,
+					   unsigned int nid,
+					   enum node_stat_item idx)
+{
+	long x;
+	struct mem_cgroup_per_node *pn;
+	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+
+	VM_BUG_ON(nid >= nr_node_ids);
+
+	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+	x = atomic_long_read(&pn->lruvec_stat[idx]);
+#ifdef CONFIG_SMP
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
+}
+#endif
+
 static char *memory_stat_format(struct mem_cgroup *memcg)
 {
 	struct seq_buf s;
 	int i;
+#ifdef CONFIG_NUMA
+	int nid;
+#endif
 
 	seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
 	if (!s.buffer)
@@ -1512,12 +1536,30 @@  static char *memory_stat_format(struct mem_cgroup *memcg)
 	 * Current memory state:
 	 */
 
-	seq_buf_printf(&s, "anon %llu\n",
+	seq_buf_printf(&s, "anon %llu",
 		       (u64)memcg_page_state(memcg, NR_ANON_MAPPED) *
 		       PAGE_SIZE);
-	seq_buf_printf(&s, "file %llu\n",
+#ifdef CONFIG_NUMA
+	for_each_node_state(nid, N_MEMORY)
+		seq_buf_printf(&s, " N%d=%llu", nid,
+			       (u64)memcg_node_page_state(memcg, nid,
+							  NR_ANON_MAPPED) *
+			       PAGE_SIZE);
+#endif
+	seq_buf_putc(&s, '\n');
+
+	seq_buf_printf(&s, "file %llu",
 		       (u64)memcg_page_state(memcg, NR_FILE_PAGES) *
 		       PAGE_SIZE);
+#ifdef CONFIG_NUMA
+	for_each_node_state(nid, N_MEMORY)
+		seq_buf_printf(&s, " N%d=%llu", nid,
+			       (u64)memcg_node_page_state(memcg, nid,
+							  NR_FILE_PAGES) *
+			       PAGE_SIZE);
+#endif
+	seq_buf_putc(&s, '\n');
+
 	seq_buf_printf(&s, "kernel_stack %llu\n",
 		       (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *
 		       1024);