
[v3,4/8] memcg: reduce memory for the lruvec and memcg stats

Message ID 20240430060612.2171650-5-shakeel.butt@linux.dev (mailing list archive)
State New
Series memcg: reduce memory consumption by memcg stats

Commit Message

Shakeel Butt April 30, 2024, 6:06 a.m. UTC
At the moment, the amount of memory allocated for stats related structs
in the mem_cgroup corresponds to the size of enum node_stat_item.
However, not all fields in enum node_stat_item have corresponding memcg
stats. So, let's use an indirection mechanism similar to the one used for
memcg vmstats management.

For a given x86_64 config, the size of stats with and without patch is:

structs size in bytes         w/o     with

struct lruvec_stats           1128     648
struct lruvec_stats_percpu     752     432
struct memcg_vmstats          1832    1352
struct memcg_vmstats_percpu   1280     960

The memory savings is further compounded by the fact that these structs
are allocated for each cpu and for each node. To be precise, for each
memcg the memory saved would be:

Memory saved = ((21 * 3 * NR_NODES) + (21 * 2 * NR_NODES * NR_CPUS) +
	       (21 * 3) + (21 * 2 * NR_CPUS)) * sizeof(long)

Where 21 is the number of fields eliminated.

Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
---

Changes since v2:
- N/A

 mm/memcontrol.c | 138 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 115 insertions(+), 23 deletions(-)
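As a rough illustration of the savings formula above, take a hypothetical
2-node, 64-CPU configuration with sizeof(long) == 8 (these numbers are not
from the patch itself):

 Memory saved = ((21 * 3 * 2) + (21 * 2 * 2 * 64) + (21 * 3) + (21 * 2 * 64)) * 8
              = (126 + 5376 + 63 + 2688) * 8
              = 66024 bytes, i.e. roughly 64 KiB per memcg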

Comments

Yosry Ahmed April 30, 2024, 8:41 a.m. UTC | #1
On Mon, Apr 29, 2024 at 11:06 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> At the moment, the amount of memory allocated for stats related structs
> in the mem_cgroup corresponds to the size of enum node_stat_item.
> However not all fields in enum node_stat_item has corresponding memcg
> stats. So, let's use indirection mechanism similar to the one used for
> memcg vmstats management.
>
> For a given x86_64 config, the size of stats with and without patch is:
>
> structs size in bytes         w/o     with
>
> struct lruvec_stats           1128     648
> struct lruvec_stats_percpu     752     432
> struct memcg_vmstats          1832    1352
> struct memcg_vmstats_percpu   1280     960
>
> The memory savings is further compounded by the fact that these structs
> are allocated for each cpu and for each node. To be precise, for each
> memcg the memory saved would be:
>
> Memory saved = ((21 * 3 * NR_NODES) + (21 * 2 * NR_NODS * NR_CPUS) +
>                (21 * 3) + (21 * 2 * NR_CPUS)) * sizeof(long)
>
> Where 21 is the number of fields eliminated.
>
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> ---
>
> Changes since v2:
> - N/A
>
>  mm/memcontrol.c | 138 ++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 115 insertions(+), 23 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 434cff91b65e..f424c5b2ba9b 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -576,35 +576,105 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
>         return mz;
>  }
>
> +/* Subset of node_stat_item for memcg stats */
> +static const unsigned int memcg_node_stat_items[] = {
> +       NR_INACTIVE_ANON,
> +       NR_ACTIVE_ANON,
> +       NR_INACTIVE_FILE,
> +       NR_ACTIVE_FILE,
> +       NR_UNEVICTABLE,
> +       NR_SLAB_RECLAIMABLE_B,
> +       NR_SLAB_UNRECLAIMABLE_B,
> +       WORKINGSET_REFAULT_ANON,
> +       WORKINGSET_REFAULT_FILE,
> +       WORKINGSET_ACTIVATE_ANON,
> +       WORKINGSET_ACTIVATE_FILE,
> +       WORKINGSET_RESTORE_ANON,
> +       WORKINGSET_RESTORE_FILE,
> +       WORKINGSET_NODERECLAIM,
> +       NR_ANON_MAPPED,
> +       NR_FILE_MAPPED,
> +       NR_FILE_PAGES,
> +       NR_FILE_DIRTY,
> +       NR_WRITEBACK,
> +       NR_SHMEM,
> +       NR_SHMEM_THPS,
> +       NR_FILE_THPS,
> +       NR_ANON_THPS,
> +       NR_KERNEL_STACK_KB,
> +       NR_PAGETABLE,
> +       NR_SECONDARY_PAGETABLE,
> +#ifdef CONFIG_SWAP
> +       NR_SWAPCACHE,
> +#endif
> +};
> +
> +static const unsigned int memcg_stat_items[] = {
> +       MEMCG_SWAP,
> +       MEMCG_SOCK,
> +       MEMCG_PERCPU_B,
> +       MEMCG_VMALLOC,
> +       MEMCG_KMEM,
> +       MEMCG_ZSWAP_B,
> +       MEMCG_ZSWAPPED,
> +};
> +
> +#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
> +#define NR_MEMCG_STATS (NR_MEMCG_NODE_STAT_ITEMS + ARRAY_SIZE(memcg_stat_items))
> +static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;

NR_MEMCG_STATS and MEMCG_NR_STAT are awfully close and have different
meanings. I think we should come up with better names (sorry nothing
comes to mind) or add a comment to make the difference more obvious.

> +
> +static void init_memcg_stats(void)
> +{
> +       int8_t i, j = 0;
> +
> +       /* Switch to short once this failure occurs. */
> +       BUILD_BUG_ON(NR_MEMCG_STATS >= 127 /* INT8_MAX */);

Should we use S8_MAX here too?

> +
> +       for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i)
> +               mem_cgroup_stats_index[memcg_node_stat_items[i]] = ++j;
> +
> +       for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i)
> +               mem_cgroup_stats_index[memcg_stat_items[i]] = ++j;
> +}
> +
> +static inline int memcg_stats_index(int idx)
> +{
> +       return mem_cgroup_stats_index[idx] - 1;
> +}
> +
>  struct lruvec_stats_percpu {
>         /* Local (CPU and cgroup) state */
> -       long state[NR_VM_NODE_STAT_ITEMS];
> +       long state[NR_MEMCG_NODE_STAT_ITEMS];
>
>         /* Delta calculation for lockless upward propagation */
> -       long state_prev[NR_VM_NODE_STAT_ITEMS];
> +       long state_prev[NR_MEMCG_NODE_STAT_ITEMS];
>  };
>
>  struct lruvec_stats {
>         /* Aggregated (CPU and subtree) state */
> -       long state[NR_VM_NODE_STAT_ITEMS];
> +       long state[NR_MEMCG_NODE_STAT_ITEMS];
>
>         /* Non-hierarchical (CPU aggregated) state */
> -       long state_local[NR_VM_NODE_STAT_ITEMS];
> +       long state_local[NR_MEMCG_NODE_STAT_ITEMS];
>
>         /* Pending child counts during tree propagation */
> -       long state_pending[NR_VM_NODE_STAT_ITEMS];
> +       long state_pending[NR_MEMCG_NODE_STAT_ITEMS];
>  };
>
>  unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
>  {
>         struct mem_cgroup_per_node *pn;
> -       long x;
> +       long x = 0;
> +       int i;
>
>         if (mem_cgroup_disabled())
>                 return node_page_state(lruvec_pgdat(lruvec), idx);
>
> -       pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> -       x = READ_ONCE(pn->lruvec_stats->state[idx]);
> +       i = memcg_stats_index(idx);
> +       if (i >= 0) {

nit: we could return here if (i < 0) like you did in
memcg_page_state() and others below, less indentation. Same for
lruvec_page_state_local().
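For example, a rough sketch of that shape (illustrative only, not the actual
follow-up patch):

unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
{
	struct mem_cgroup_per_node *pn;
	long x;
	int i;

	if (mem_cgroup_disabled())
		return node_page_state(lruvec_pgdat(lruvec), idx);

	/* Early return keeps the common path at one indentation level. */
	i = memcg_stats_index(idx);
	if (i < 0)
		return 0;

	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
	x = READ_ONCE(pn->lruvec_stats->state[i]);
#ifdef CONFIG_SMP
	if (x < 0)
		x = 0;
#endif
	return x;
}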

> +               pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> +               x = READ_ONCE(pn->lruvec_stats->state[i]);
> +       }
>  #ifdef CONFIG_SMP
>         if (x < 0)
>                 x = 0;
> @@ -617,12 +687,16 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>  {
>         struct mem_cgroup_per_node *pn;
>         long x = 0;
> +       int i;
>
>         if (mem_cgroup_disabled())
>                 return node_page_state(lruvec_pgdat(lruvec), idx);
>
> -       pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> -       x = READ_ONCE(pn->lruvec_stats->state_local[idx]);
> +       i = memcg_stats_index(idx);
> +       if (i >= 0) {
> +               pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> +               x = READ_ONCE(pn->lruvec_stats->state_local[i]);
> +       }
>  #ifdef CONFIG_SMP
>         if (x < 0)
>                 x = 0;
> @@ -689,11 +763,11 @@ struct memcg_vmstats_percpu {
>         /* The above should fit a single cacheline for memcg_rstat_updated() */
>
>         /* Local (CPU and cgroup) page state & events */
> -       long                    state[MEMCG_NR_STAT];
> +       long                    state[NR_MEMCG_STATS];
>         unsigned long           events[NR_MEMCG_EVENTS];
>
>         /* Delta calculation for lockless upward propagation */
> -       long                    state_prev[MEMCG_NR_STAT];
> +       long                    state_prev[NR_MEMCG_STATS];
>         unsigned long           events_prev[NR_MEMCG_EVENTS];
>
>         /* Cgroup1: threshold notifications & softlimit tree updates */
> @@ -703,15 +777,15 @@ struct memcg_vmstats_percpu {
>
>  struct memcg_vmstats {
>         /* Aggregated (CPU and subtree) page state & events */
> -       long                    state[MEMCG_NR_STAT];
> +       long                    state[NR_MEMCG_STATS];
>         unsigned long           events[NR_MEMCG_EVENTS];
>
>         /* Non-hierarchical (CPU aggregated) page state & events */
> -       long                    state_local[MEMCG_NR_STAT];
> +       long                    state_local[NR_MEMCG_STATS];
>         unsigned long           events_local[NR_MEMCG_EVENTS];
>
>         /* Pending child counts during tree propagation */
> -       long                    state_pending[MEMCG_NR_STAT];
> +       long                    state_pending[NR_MEMCG_STATS];
>         unsigned long           events_pending[NR_MEMCG_EVENTS];
>
>         /* Stats updates since the last flush */
> @@ -844,7 +918,13 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
>
>  unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
>  {
> -       long x = READ_ONCE(memcg->vmstats->state[idx]);
> +       long x;
> +       int i = memcg_stats_index(idx);
> +
> +       if (i < 0)
> +               return 0;
> +
> +       x = READ_ONCE(memcg->vmstats->state[i]);
>  #ifdef CONFIG_SMP
>         if (x < 0)
>                 x = 0;
> @@ -876,18 +956,25 @@ static int memcg_state_val_in_pages(int idx, int val)
>   */
>  void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
>  {
> -       if (mem_cgroup_disabled())
> +       int i = memcg_stats_index(idx);
> +
> +       if (mem_cgroup_disabled() || i < 0)
>                 return;
>
> -       __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
> +       __this_cpu_add(memcg->vmstats_percpu->state[i], val);
>         memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
>  }
>
>  /* idx can be of type enum memcg_stat_item or node_stat_item. */
>  static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
>  {
> -       long x = READ_ONCE(memcg->vmstats->state_local[idx]);
> +       long x;
> +       int i = memcg_stats_index(idx);
> +
> +       if (i < 0)
> +               return 0;
>
> +       x = READ_ONCE(memcg->vmstats->state_local[i]);
>  #ifdef CONFIG_SMP
>         if (x < 0)
>                 x = 0;
> @@ -901,6 +988,10 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
>  {
>         struct mem_cgroup_per_node *pn;
>         struct mem_cgroup *memcg;
> +       int i = memcg_stats_index(idx);
> +
> +       if (i < 0)
> +               return;
>
>         pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
>         memcg = pn->memcg;
> @@ -930,10 +1021,10 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
>         }
>
>         /* Update memcg */
> -       __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
> +       __this_cpu_add(memcg->vmstats_percpu->state[i], val);
>
>         /* Update lruvec */
> -       __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
> +       __this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
>
>         memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
>         memcg_stats_unlock();
> @@ -5702,6 +5793,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
>                 page_counter_init(&memcg->kmem, &parent->kmem);
>                 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
>         } else {
> +               init_memcg_stats();
>                 init_memcg_events();
>                 page_counter_init(&memcg->memory, NULL);
>                 page_counter_init(&memcg->swap, NULL);
> @@ -5873,7 +5965,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>
>         statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
>
> -       for (i = 0; i < MEMCG_NR_STAT; i++) {
> +       for (i = 0; i < NR_MEMCG_STATS; i++) {
>                 /*
>                  * Collect the aggregated propagation counts of groups
>                  * below us. We're in a per-cpu loop here and this is
> @@ -5937,7 +6029,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>
>                 lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
>
> -               for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
> +               for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; i++) {
>                         delta = lstats->state_pending[i];
>                         if (delta)
>                                 lstats->state_pending[i] = 0;
> --
> 2.43.0
>
T.J. Mercier April 30, 2024, 5:30 p.m. UTC | #2
On Mon, Apr 29, 2024 at 11:06 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> At the moment, the amount of memory allocated for stats related structs
> in the mem_cgroup corresponds to the size of enum node_stat_item.
> However not all fields in enum node_stat_item has corresponding memcg

typo: "have corresponding"

> stats. So, let's use indirection mechanism similar to the one used for
> memcg vmstats management.
>
> For a given x86_64 config, the size of stats with and without patch is:
>
> structs size in bytes         w/o     with
>
> struct lruvec_stats           1128     648
> struct lruvec_stats_percpu     752     432
> struct memcg_vmstats          1832    1352
> struct memcg_vmstats_percpu   1280     960
>
> The memory savings is further compounded by the fact that these structs
> are allocated for each cpu and for each node. To be precise, for each
> memcg the memory saved would be:
>
> Memory saved = ((21 * 3 * NR_NODES) + (21 * 2 * NR_NODS * NR_CPUS) +

typo: "NR_NODES"

>                (21 * 3) + (21 * 2 * NR_CPUS)) * sizeof(long)
>
> Where 21 is the number of fields eliminated.
>
> Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
> ---
>
> Changes since v2:
> - N/A
>
>  mm/memcontrol.c | 138 ++++++++++++++++++++++++++++++++++++++++--------
>  1 file changed, 115 insertions(+), 23 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 434cff91b65e..f424c5b2ba9b 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -576,35 +576,105 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
>         return mz;
>  }
>
> +/* Subset of node_stat_item for memcg stats */
> +static const unsigned int memcg_node_stat_items[] = {
> +       NR_INACTIVE_ANON,
> +       NR_ACTIVE_ANON,
> +       NR_INACTIVE_FILE,
> +       NR_ACTIVE_FILE,
> +       NR_UNEVICTABLE,
> +       NR_SLAB_RECLAIMABLE_B,
> +       NR_SLAB_UNRECLAIMABLE_B,
> +       WORKINGSET_REFAULT_ANON,
> +       WORKINGSET_REFAULT_FILE,
> +       WORKINGSET_ACTIVATE_ANON,
> +       WORKINGSET_ACTIVATE_FILE,
> +       WORKINGSET_RESTORE_ANON,
> +       WORKINGSET_RESTORE_FILE,
> +       WORKINGSET_NODERECLAIM,
> +       NR_ANON_MAPPED,
> +       NR_FILE_MAPPED,
> +       NR_FILE_PAGES,
> +       NR_FILE_DIRTY,
> +       NR_WRITEBACK,
> +       NR_SHMEM,
> +       NR_SHMEM_THPS,
> +       NR_FILE_THPS,
> +       NR_ANON_THPS,
> +       NR_KERNEL_STACK_KB,
> +       NR_PAGETABLE,
> +       NR_SECONDARY_PAGETABLE,
> +#ifdef CONFIG_SWAP
> +       NR_SWAPCACHE,
> +#endif
> +};
> +
> +static const unsigned int memcg_stat_items[] = {
> +       MEMCG_SWAP,
> +       MEMCG_SOCK,
> +       MEMCG_PERCPU_B,
> +       MEMCG_VMALLOC,
> +       MEMCG_KMEM,
> +       MEMCG_ZSWAP_B,
> +       MEMCG_ZSWAPPED,
> +};
> +
> +#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
> +#define NR_MEMCG_STATS (NR_MEMCG_NODE_STAT_ITEMS + ARRAY_SIZE(memcg_stat_items))
> +static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;
> +
> +static void init_memcg_stats(void)
> +{
> +       int8_t i, j = 0;
> +
> +       /* Switch to short once this failure occurs. */
> +       BUILD_BUG_ON(NR_MEMCG_STATS >= 127 /* INT8_MAX */);
> +
> +       for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i)
> +               mem_cgroup_stats_index[memcg_node_stat_items[i]] = ++j;
> +
> +       for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i)
> +               mem_cgroup_stats_index[memcg_stat_items[i]] = ++j;
> +}
> +
> +static inline int memcg_stats_index(int idx)
> +{
> +       return mem_cgroup_stats_index[idx] - 1;

Could this just be: return mem_cgroup_stats_index[idx];
with a postfix increment of j in init_memcg_stats instead of prefix increment?


> +}
> +
>  struct lruvec_stats_percpu {
>         /* Local (CPU and cgroup) state */
> -       long state[NR_VM_NODE_STAT_ITEMS];
> +       long state[NR_MEMCG_NODE_STAT_ITEMS];
>
>         /* Delta calculation for lockless upward propagation */
> -       long state_prev[NR_VM_NODE_STAT_ITEMS];
> +       long state_prev[NR_MEMCG_NODE_STAT_ITEMS];
>  };
>
>  struct lruvec_stats {
>         /* Aggregated (CPU and subtree) state */
> -       long state[NR_VM_NODE_STAT_ITEMS];
> +       long state[NR_MEMCG_NODE_STAT_ITEMS];
>
>         /* Non-hierarchical (CPU aggregated) state */
> -       long state_local[NR_VM_NODE_STAT_ITEMS];
> +       long state_local[NR_MEMCG_NODE_STAT_ITEMS];
>
>         /* Pending child counts during tree propagation */
> -       long state_pending[NR_VM_NODE_STAT_ITEMS];
> +       long state_pending[NR_MEMCG_NODE_STAT_ITEMS];
>  };
>
>  unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
>  {
>         struct mem_cgroup_per_node *pn;
> -       long x;
> +       long x = 0;
> +       int i;
>
>         if (mem_cgroup_disabled())
>                 return node_page_state(lruvec_pgdat(lruvec), idx);
>
> -       pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> -       x = READ_ONCE(pn->lruvec_stats->state[idx]);
> +       i = memcg_stats_index(idx);
> +       if (i >= 0) {
> +               pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> +               x = READ_ONCE(pn->lruvec_stats->state[i]);
> +       }
>  #ifdef CONFIG_SMP
>         if (x < 0)
>                 x = 0;
> @@ -617,12 +687,16 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
>  {
>         struct mem_cgroup_per_node *pn;
>         long x = 0;
> +       int i;
>
>         if (mem_cgroup_disabled())
>                 return node_page_state(lruvec_pgdat(lruvec), idx);
>
> -       pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> -       x = READ_ONCE(pn->lruvec_stats->state_local[idx]);
> +       i = memcg_stats_index(idx);
> +       if (i >= 0) {
> +               pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> +               x = READ_ONCE(pn->lruvec_stats->state_local[i]);
> +       }
>  #ifdef CONFIG_SMP
>         if (x < 0)
>                 x = 0;
> @@ -689,11 +763,11 @@ struct memcg_vmstats_percpu {
>         /* The above should fit a single cacheline for memcg_rstat_updated() */
>
>         /* Local (CPU and cgroup) page state & events */
> -       long                    state[MEMCG_NR_STAT];
> +       long                    state[NR_MEMCG_STATS];
>         unsigned long           events[NR_MEMCG_EVENTS];
>
>         /* Delta calculation for lockless upward propagation */
> -       long                    state_prev[MEMCG_NR_STAT];
> +       long                    state_prev[NR_MEMCG_STATS];
>         unsigned long           events_prev[NR_MEMCG_EVENTS];
>
>         /* Cgroup1: threshold notifications & softlimit tree updates */
> @@ -703,15 +777,15 @@ struct memcg_vmstats_percpu {
>
>  struct memcg_vmstats {
>         /* Aggregated (CPU and subtree) page state & events */
> -       long                    state[MEMCG_NR_STAT];
> +       long                    state[NR_MEMCG_STATS];
>         unsigned long           events[NR_MEMCG_EVENTS];
>
>         /* Non-hierarchical (CPU aggregated) page state & events */
> -       long                    state_local[MEMCG_NR_STAT];
> +       long                    state_local[NR_MEMCG_STATS];
>         unsigned long           events_local[NR_MEMCG_EVENTS];
>
>         /* Pending child counts during tree propagation */
> -       long                    state_pending[MEMCG_NR_STAT];
> +       long                    state_pending[NR_MEMCG_STATS];
>         unsigned long           events_pending[NR_MEMCG_EVENTS];
>
>         /* Stats updates since the last flush */
> @@ -844,7 +918,13 @@ static void flush_memcg_stats_dwork(struct work_struct *w)
>
>  unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
>  {
> -       long x = READ_ONCE(memcg->vmstats->state[idx]);
> +       long x;
> +       int i = memcg_stats_index(idx);
> +
> +       if (i < 0)
> +               return 0;
> +
> +       x = READ_ONCE(memcg->vmstats->state[i]);
>  #ifdef CONFIG_SMP
>         if (x < 0)
>                 x = 0;
> @@ -876,18 +956,25 @@ static int memcg_state_val_in_pages(int idx, int val)
>   */
>  void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
>  {
> -       if (mem_cgroup_disabled())
> +       int i = memcg_stats_index(idx);
> +
> +       if (mem_cgroup_disabled() || i < 0)
>                 return;
>
> -       __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
> +       __this_cpu_add(memcg->vmstats_percpu->state[i], val);
>         memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
>  }
>
>  /* idx can be of type enum memcg_stat_item or node_stat_item. */
>  static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
>  {
> -       long x = READ_ONCE(memcg->vmstats->state_local[idx]);
> +       long x;
> +       int i = memcg_stats_index(idx);
> +
> +       if (i < 0)
> +               return 0;
>
> +       x = READ_ONCE(memcg->vmstats->state_local[i]);
>  #ifdef CONFIG_SMP
>         if (x < 0)
>                 x = 0;
> @@ -901,6 +988,10 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
>  {
>         struct mem_cgroup_per_node *pn;
>         struct mem_cgroup *memcg;
> +       int i = memcg_stats_index(idx);
> +
> +       if (i < 0)
> +               return;
>
>         pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
>         memcg = pn->memcg;
> @@ -930,10 +1021,10 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
>         }
>
>         /* Update memcg */
> -       __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
> +       __this_cpu_add(memcg->vmstats_percpu->state[i], val);
>
>         /* Update lruvec */
> -       __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
> +       __this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
>
>         memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
>         memcg_stats_unlock();
> @@ -5702,6 +5793,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
>                 page_counter_init(&memcg->kmem, &parent->kmem);
>                 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
>         } else {
> +               init_memcg_stats();
>                 init_memcg_events();
>                 page_counter_init(&memcg->memory, NULL);
>                 page_counter_init(&memcg->swap, NULL);
> @@ -5873,7 +5965,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>
>         statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
>
> -       for (i = 0; i < MEMCG_NR_STAT; i++) {
> +       for (i = 0; i < NR_MEMCG_STATS; i++) {
>                 /*
>                  * Collect the aggregated propagation counts of groups
>                  * below us. We're in a per-cpu loop here and this is
> @@ -5937,7 +6029,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
>
>                 lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
>
> -               for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
> +               for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; i++) {
>                         delta = lstats->state_pending[i];
>                         if (delta)
>                                 lstats->state_pending[i] = 0;
> --
> 2.43.0
>
Shakeel Butt April 30, 2024, 5:37 p.m. UTC | #3
On Tue, Apr 30, 2024 at 01:41:38AM -0700, Yosry Ahmed wrote:
> On Mon, Apr 29, 2024 at 11:06 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >
[...]
> > +
> > +#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
> > +#define NR_MEMCG_STATS (NR_MEMCG_NODE_STAT_ITEMS + ARRAY_SIZE(memcg_stat_items))
> > +static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;
> 
> NR_MEMCG_STATS and MEMCG_NR_STAT are awfully close and have different
> meanings. I think we should come up with better names (sorry nothing
> comes to mind) or add a comment to make the difference more obvious.
> 

How about the following comment?

/*
 * Please note that NR_MEMCG_STATS represents the number of memcg stats
 * we store in memory while MEMCG_NR_STAT represents the max enum value
 * of the memcg stats.
 */

> > +
> > +static void init_memcg_stats(void)
> > +{
> > +       int8_t i, j = 0;
> > +
> > +       /* Switch to short once this failure occurs. */
> > +       BUILD_BUG_ON(NR_MEMCG_STATS >= 127 /* INT8_MAX */);
> 
> Should we use S8_MAX here too?
> 

Yes. Andrew, can you please add the above comment and replacement of
127 with S8_MAX in the patch?

[...]
> >
> > -       pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> > -       x = READ_ONCE(pn->lruvec_stats->state[idx]);
> > +       i = memcg_stats_index(idx);
> > +       if (i >= 0) {
> 
> nit: we could return here if (i < 0) like you did in
> memcg_page_state() and others below, less indentation. Same for
> lruvec_page_state_local().
> 

I have fixed this in the following patch which adds warnings.


Thanks for the reviews.
Yosry Ahmed April 30, 2024, 5:41 p.m. UTC | #4
On Tue, Apr 30, 2024 at 10:38 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Tue, Apr 30, 2024 at 01:41:38AM -0700, Yosry Ahmed wrote:
> > On Mon, Apr 29, 2024 at 11:06 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > >
> [...]
> > > +
> > > +#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
> > > +#define NR_MEMCG_STATS (NR_MEMCG_NODE_STAT_ITEMS + ARRAY_SIZE(memcg_stat_items))
> > > +static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;
> >
> > NR_MEMCG_STATS and MEMCG_NR_STAT are awfully close and have different
> > meanings. I think we should come up with better names (sorry nothing
> > comes to mind) or add a comment to make the difference more obvious.
> >
>
> How about the following comment?

The comment LGTM. I prefer renaming them though if someone can come up
with better names.

>
> /*
>  * Please note that NR_MEMCG_STATS represents the number of memcg stats
>  * we store in memory while MEMCG_NR_STAT represents the max enum value
>  * of the memcg stats.
>  */
>
> > > +
> > > +static void init_memcg_stats(void)
> > > +{
> > > +       int8_t i, j = 0;
> > > +
> > > +       /* Switch to short once this failure occurs. */
> > > +       BUILD_BUG_ON(NR_MEMCG_STATS >= 127 /* INT8_MAX */);
> >
> > Should we use S8_MAX here too?
> >
>
> Yes. Andrew, can you please add the above comment and replacement of
> 127 with S8_MAX in the patch?
>
> [...]
> > >
> > > -       pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
> > > -       x = READ_ONCE(pn->lruvec_stats->state[idx]);
> > > +       i = memcg_stats_index(idx);
> > > +       if (i >= 0) {
> >
> > nit: we could return here if (i < 0) like you did in
> > memcg_page_state() and others below, less indentation. Same for
> > lruvec_page_state_local().
> >
>
> I have fixed this in the following patch which adds warnings.

Yeah I saw that after reviewing this one.

FWIW, *if* you respin this, fixing this here would reduce the diff
noise in the patch that adds the warnings.
Shakeel Butt April 30, 2024, 5:45 p.m. UTC | #5
On Tue, Apr 30, 2024 at 10:30:51AM -0700, T.J. Mercier wrote:
> On Mon, Apr 29, 2024 at 11:06 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >
> > +
> > +static inline int memcg_stats_index(int idx)
> > +{
> > +       return mem_cgroup_stats_index[idx] - 1;
> 
> Could this just be: return mem_cgroup_stats_index[idx];
> with a postfix increment of j in init_memcg_stats instead of prefix increment?
> 

The -1 is basically for error checking but I will do a followup patch to
initialize the array/indirection-table with -1 and remove the
subtraction from the fast path.

Thanks for the review.
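A minimal sketch of what that follow-up could look like, reusing the names
from this patch (assumed here; the actual follow-up may differ):

static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly = {
	/* Hypothetical: pre-fill with -1 so the lookup needs no subtraction. */
	[0 ... MEMCG_NR_STAT - 1] = -1,
};

static void init_memcg_stats(void)
{
	int8_t i, j = 0;

	for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i)
		mem_cgroup_stats_index[memcg_node_stat_items[i]] = j++;

	for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i)
		mem_cgroup_stats_index[memcg_stat_items[i]] = j++;
}

static inline int memcg_stats_index(int idx)
{
	/* Tracked stats map to [0, NR_MEMCG_STATS); untracked ones stay at -1. */
	return mem_cgroup_stats_index[idx];
}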
Shakeel Butt April 30, 2024, 5:49 p.m. UTC | #6
On Tue, Apr 30, 2024 at 10:41:01AM -0700, Yosry Ahmed wrote:
> On Tue, Apr 30, 2024 at 10:38 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >
> > On Tue, Apr 30, 2024 at 01:41:38AM -0700, Yosry Ahmed wrote:
> > > On Mon, Apr 29, 2024 at 11:06 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
[...]
> > >
> > > nit: we could return here if (i < 0) like you did in
> > > memcg_page_state() and others below, less indentation. Same for
> > > lruvec_page_state_local().
> > >
> >
> > I have fixed this in the following patch which adds warnings.
> 
> Yeah I saw that after reviewing this one.
> 
> FWIW, *if* you respin this, fixing this here would reduce the diff
> noise in the patch that adds the warnings.

Yeah, if I need to respin, I will change this.
Shakeel Butt April 30, 2024, 11 p.m. UTC | #7
On Tue, Apr 30, 2024 at 10:41:01AM -0700, Yosry Ahmed wrote:
> On Tue, Apr 30, 2024 at 10:38 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >
> > On Tue, Apr 30, 2024 at 01:41:38AM -0700, Yosry Ahmed wrote:
> > > On Mon, Apr 29, 2024 at 11:06 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > > >
> > [...]
> > > > +
> > > > +#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
> > > > +#define NR_MEMCG_STATS (NR_MEMCG_NODE_STAT_ITEMS + ARRAY_SIZE(memcg_stat_items))
> > > > +static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;
> > >
> > > NR_MEMCG_STATS and MEMCG_NR_STAT are awfully close and have different
> > > meanings. I think we should come up with better names (sorry nothing
> > > comes to mind) or add a comment to make the difference more obvious.
> > >
> >
> > How about the following comment?
> 
> The comment LGTM. I prefer renaming them though if someone can come up
> with better names.
> 

I will be posting v4 and will change the name (still thinking about the
name) because:

> > > > +static void init_memcg_stats(void)
> > > > +{
> > > > +       int8_t i, j = 0;
> > > > +
> > > > +       /* Switch to short once this failure occurs. */
> > > > +       BUILD_BUG_ON(NR_MEMCG_STATS >= 127 /* INT8_MAX */);

The above should be MEMCG_NR_STAT instead of NR_MEMCG_STATS.
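
In other words, the check would become something like (a sketch combining
that with the S8_MAX suggestion above):

	/* Stored indices never exceed MEMCG_NR_STAT, so they fit in int8_t. */
	BUILD_BUG_ON(MEMCG_NR_STAT >= S8_MAX);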
Yosry Ahmed April 30, 2024, 11:07 p.m. UTC | #8
On Tue, Apr 30, 2024 at 4:00 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Tue, Apr 30, 2024 at 10:41:01AM -0700, Yosry Ahmed wrote:
> > On Tue, Apr 30, 2024 at 10:38 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > >
> > > On Tue, Apr 30, 2024 at 01:41:38AM -0700, Yosry Ahmed wrote:
> > > > On Mon, Apr 29, 2024 at 11:06 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > > > >
> > > [...]
> > > > > +
> > > > > +#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
> > > > > +#define NR_MEMCG_STATS (NR_MEMCG_NODE_STAT_ITEMS + ARRAY_SIZE(memcg_stat_items))
> > > > > +static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;
> > > >
> > > > NR_MEMCG_STATS and MEMCG_NR_STAT are awfully close and have different
> > > > meanings. I think we should come up with better names (sorry nothing
> > > > comes to mind) or add a comment to make the difference more obvious.
> > > >
> > >
> > > How about the following comment?
> >
> > The comment LGTM. I prefer renaming them though if someone can come up
> > with better names.
> >
>
> I will be posting v4 and will change the name (still thinking about the
> name) becasuse:
>
> > > > > +static void init_memcg_stats(void)
> > > > > +{
> > > > > +       int8_t i, j = 0;
> > > > > +
> > > > > +       /* Switch to short once this failure occurs. */
> > > > > +       BUILD_BUG_ON(NR_MEMCG_STATS >= 127 /* INT8_MAX */);
>
> The above should be MEMCG_NR_STAT instead of NR_MEMCG_STATS.

Yeah it's pretty confusing :)

How about something explicit like:

NR_MEMCG_POSSIBLE_STAT_ITEMS / MEMCG_MAX_STAT_ITEM
NR_MEMCG_ACTUAL_STAT_ITEMS / MEMCG_ACTUAL_NR_STAT

They look ugly, but I can't think of anything better. Maybe they will
inspire something better :)
Johannes Weiner May 1, 2024, 12:50 a.m. UTC | #9
On Tue, Apr 30, 2024 at 04:07:05PM -0700, Yosry Ahmed wrote:
> On Tue, Apr 30, 2024 at 4:00 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >
> > On Tue, Apr 30, 2024 at 10:41:01AM -0700, Yosry Ahmed wrote:
> > > On Tue, Apr 30, 2024 at 10:38 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > > >
> > > > On Tue, Apr 30, 2024 at 01:41:38AM -0700, Yosry Ahmed wrote:
> > > > > On Mon, Apr 29, 2024 at 11:06 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > > > > >
> > > > [...]
> > > > > > +
> > > > > > +#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
> > > > > > +#define NR_MEMCG_STATS (NR_MEMCG_NODE_STAT_ITEMS + ARRAY_SIZE(memcg_stat_items))
> > > > > > +static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;
> > > > >
> > > > > NR_MEMCG_STATS and MEMCG_NR_STAT are awfully close and have different
> > > > > meanings. I think we should come up with better names (sorry nothing
> > > > > comes to mind) or add a comment to make the difference more obvious.
> > > > >
> > > >
> > > > How about the following comment?
> > >
> > > The comment LGTM. I prefer renaming them though if someone can come up
> > > with better names.
> > >
> >
> > I will be posting v4 and will change the name (still thinking about the
> > name) becasuse:
> >
> > > > > > +static void init_memcg_stats(void)
> > > > > > +{
> > > > > > +       int8_t i, j = 0;
> > > > > > +
> > > > > > +       /* Switch to short once this failure occurs. */
> > > > > > +       BUILD_BUG_ON(NR_MEMCG_STATS >= 127 /* INT8_MAX */);
> >
> > The above should be MEMCG_NR_STAT instead of NR_MEMCG_STATS.
> 
> Yeah it's pretty confusing :)
> 
> How about something explicit like:
> 
> NR_MEMCG_POSSIBLE_STAT_ITEMS / MEMCG_MAX_STAT_ITEM
> NR_MEMCG_ACTUAL_STAT_ITEMS / MEMCG_ACTUAL_NR_STAT

NR is pretty common to mark the end of an enum range. It would be good
to keep that for enum memcg_stat_item.

The other one is about an array, where we usually use "size" or
"len". How about using one of those instead? I think it should be
sufficiently distinguished then:

- MEMORY_STAT_LEN
- MEMCG_VMSTAT_LEN
- MEMCG_VMSTAT_SIZE
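
Applied to the defines in this patch, the last option might look like this
(a hypothetical rename, not something posted in this thread):

#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
#define MEMCG_VMSTAT_SIZE (NR_MEMCG_NODE_STAT_ITEMS + ARRAY_SIZE(memcg_stat_items))

static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;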

Patch

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 434cff91b65e..f424c5b2ba9b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -576,35 +576,105 @@  mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 	return mz;
 }
 
+/* Subset of node_stat_item for memcg stats */
+static const unsigned int memcg_node_stat_items[] = {
+	NR_INACTIVE_ANON,
+	NR_ACTIVE_ANON,
+	NR_INACTIVE_FILE,
+	NR_ACTIVE_FILE,
+	NR_UNEVICTABLE,
+	NR_SLAB_RECLAIMABLE_B,
+	NR_SLAB_UNRECLAIMABLE_B,
+	WORKINGSET_REFAULT_ANON,
+	WORKINGSET_REFAULT_FILE,
+	WORKINGSET_ACTIVATE_ANON,
+	WORKINGSET_ACTIVATE_FILE,
+	WORKINGSET_RESTORE_ANON,
+	WORKINGSET_RESTORE_FILE,
+	WORKINGSET_NODERECLAIM,
+	NR_ANON_MAPPED,
+	NR_FILE_MAPPED,
+	NR_FILE_PAGES,
+	NR_FILE_DIRTY,
+	NR_WRITEBACK,
+	NR_SHMEM,
+	NR_SHMEM_THPS,
+	NR_FILE_THPS,
+	NR_ANON_THPS,
+	NR_KERNEL_STACK_KB,
+	NR_PAGETABLE,
+	NR_SECONDARY_PAGETABLE,
+#ifdef CONFIG_SWAP
+	NR_SWAPCACHE,
+#endif
+};
+
+static const unsigned int memcg_stat_items[] = {
+	MEMCG_SWAP,
+	MEMCG_SOCK,
+	MEMCG_PERCPU_B,
+	MEMCG_VMALLOC,
+	MEMCG_KMEM,
+	MEMCG_ZSWAP_B,
+	MEMCG_ZSWAPPED,
+};
+
+#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
+#define NR_MEMCG_STATS (NR_MEMCG_NODE_STAT_ITEMS + ARRAY_SIZE(memcg_stat_items))
+static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;
+
+static void init_memcg_stats(void)
+{
+	int8_t i, j = 0;
+
+	/* Switch to short once this failure occurs. */
+	BUILD_BUG_ON(NR_MEMCG_STATS >= 127 /* INT8_MAX */);
+
+	for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i)
+		mem_cgroup_stats_index[memcg_node_stat_items[i]] = ++j;
+
+	for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i)
+		mem_cgroup_stats_index[memcg_stat_items[i]] = ++j;
+}
+
+static inline int memcg_stats_index(int idx)
+{
+	return mem_cgroup_stats_index[idx] - 1;
+}
+
 struct lruvec_stats_percpu {
 	/* Local (CPU and cgroup) state */
-	long state[NR_VM_NODE_STAT_ITEMS];
+	long state[NR_MEMCG_NODE_STAT_ITEMS];
 
 	/* Delta calculation for lockless upward propagation */
-	long state_prev[NR_VM_NODE_STAT_ITEMS];
+	long state_prev[NR_MEMCG_NODE_STAT_ITEMS];
 };
 
 struct lruvec_stats {
 	/* Aggregated (CPU and subtree) state */
-	long state[NR_VM_NODE_STAT_ITEMS];
+	long state[NR_MEMCG_NODE_STAT_ITEMS];
 
 	/* Non-hierarchical (CPU aggregated) state */
-	long state_local[NR_VM_NODE_STAT_ITEMS];
+	long state_local[NR_MEMCG_NODE_STAT_ITEMS];
 
 	/* Pending child counts during tree propagation */
-	long state_pending[NR_VM_NODE_STAT_ITEMS];
+	long state_pending[NR_MEMCG_NODE_STAT_ITEMS];
 };
 
 unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
 {
 	struct mem_cgroup_per_node *pn;
-	long x;
+	long x = 0;
+	int i;
 
 	if (mem_cgroup_disabled())
 		return node_page_state(lruvec_pgdat(lruvec), idx);
 
-	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-	x = READ_ONCE(pn->lruvec_stats->state[idx]);
+	i = memcg_stats_index(idx);
+	if (i >= 0) {
+		pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+		x = READ_ONCE(pn->lruvec_stats->state[i]);
+	}
 #ifdef CONFIG_SMP
 	if (x < 0)
 		x = 0;
@@ -617,12 +687,16 @@  unsigned long lruvec_page_state_local(struct lruvec *lruvec,
 {
 	struct mem_cgroup_per_node *pn;
 	long x = 0;
+	int i;
 
 	if (mem_cgroup_disabled())
 		return node_page_state(lruvec_pgdat(lruvec), idx);
 
-	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-	x = READ_ONCE(pn->lruvec_stats->state_local[idx]);
+	i = memcg_stats_index(idx);
+	if (i >= 0) {
+		pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+		x = READ_ONCE(pn->lruvec_stats->state_local[i]);
+	}
 #ifdef CONFIG_SMP
 	if (x < 0)
 		x = 0;
@@ -689,11 +763,11 @@  struct memcg_vmstats_percpu {
 	/* The above should fit a single cacheline for memcg_rstat_updated() */
 
 	/* Local (CPU and cgroup) page state & events */
-	long			state[MEMCG_NR_STAT];
+	long			state[NR_MEMCG_STATS];
 	unsigned long		events[NR_MEMCG_EVENTS];
 
 	/* Delta calculation for lockless upward propagation */
-	long			state_prev[MEMCG_NR_STAT];
+	long			state_prev[NR_MEMCG_STATS];
 	unsigned long		events_prev[NR_MEMCG_EVENTS];
 
 	/* Cgroup1: threshold notifications & softlimit tree updates */
@@ -703,15 +777,15 @@  struct memcg_vmstats_percpu {
 
 struct memcg_vmstats {
 	/* Aggregated (CPU and subtree) page state & events */
-	long			state[MEMCG_NR_STAT];
+	long			state[NR_MEMCG_STATS];
 	unsigned long		events[NR_MEMCG_EVENTS];
 
 	/* Non-hierarchical (CPU aggregated) page state & events */
-	long			state_local[MEMCG_NR_STAT];
+	long			state_local[NR_MEMCG_STATS];
 	unsigned long		events_local[NR_MEMCG_EVENTS];
 
 	/* Pending child counts during tree propagation */
-	long			state_pending[MEMCG_NR_STAT];
+	long			state_pending[NR_MEMCG_STATS];
 	unsigned long		events_pending[NR_MEMCG_EVENTS];
 
 	/* Stats updates since the last flush */
@@ -844,7 +918,13 @@  static void flush_memcg_stats_dwork(struct work_struct *w)
 
 unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
 {
-	long x = READ_ONCE(memcg->vmstats->state[idx]);
+	long x;
+	int i = memcg_stats_index(idx);
+
+	if (i < 0)
+		return 0;
+
+	x = READ_ONCE(memcg->vmstats->state[i]);
 #ifdef CONFIG_SMP
 	if (x < 0)
 		x = 0;
@@ -876,18 +956,25 @@  static int memcg_state_val_in_pages(int idx, int val)
  */
 void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 {
-	if (mem_cgroup_disabled())
+	int i = memcg_stats_index(idx);
+
+	if (mem_cgroup_disabled() || i < 0)
 		return;
 
-	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
+	__this_cpu_add(memcg->vmstats_percpu->state[i], val);
 	memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
 }
 
 /* idx can be of type enum memcg_stat_item or node_stat_item. */
 static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
 {
-	long x = READ_ONCE(memcg->vmstats->state_local[idx]);
+	long x;
+	int i = memcg_stats_index(idx);
+
+	if (i < 0)
+		return 0;
 
+	x = READ_ONCE(memcg->vmstats->state_local[i]);
 #ifdef CONFIG_SMP
 	if (x < 0)
 		x = 0;
@@ -901,6 +988,10 @@  static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
 {
 	struct mem_cgroup_per_node *pn;
 	struct mem_cgroup *memcg;
+	int i = memcg_stats_index(idx);
+
+	if (i < 0)
+		return;
 
 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
 	memcg = pn->memcg;
@@ -930,10 +1021,10 @@  static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
 	}
 
 	/* Update memcg */
-	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
+	__this_cpu_add(memcg->vmstats_percpu->state[i], val);
 
 	/* Update lruvec */
-	__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
+	__this_cpu_add(pn->lruvec_stats_percpu->state[i], val);
 
 	memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
 	memcg_stats_unlock();
@@ -5702,6 +5793,7 @@  mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 		page_counter_init(&memcg->kmem, &parent->kmem);
 		page_counter_init(&memcg->tcpmem, &parent->tcpmem);
 	} else {
+		init_memcg_stats();
 		init_memcg_events();
 		page_counter_init(&memcg->memory, NULL);
 		page_counter_init(&memcg->swap, NULL);
@@ -5873,7 +5965,7 @@  static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 
 	statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
 
-	for (i = 0; i < MEMCG_NR_STAT; i++) {
+	for (i = 0; i < NR_MEMCG_STATS; i++) {
 		/*
 		 * Collect the aggregated propagation counts of groups
 		 * below us. We're in a per-cpu loop here and this is
@@ -5937,7 +6029,7 @@  static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 
 		lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
 
-		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+		for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; i++) {
 			delta = lstats->state_pending[i];
 			if (delta)
 				lstats->state_pending[i] = 0;