| Message ID | 20200608230654.828134-3-guro@fb.com (mailing list archive) |
|---|---|
| State | New, archived |
| Series | The new cgroup slab memory controller |
On Mon, Jun 8, 2020 at 4:07 PM Roman Gushchin <guro@fb.com> wrote:
>
> To implement per-object slab memory accounting, we need to
> convert slab vmstat counters to bytes. Actually, out of
> 4 levels of counters: global, per-node, per-memcg and per-lruvec
> only two last levels will require byte-sized counters.
> It's because global and per-node counters will be counting the
> number of slab pages, and per-memcg and per-lruvec will be
> counting the amount of memory taken by charged slab objects.
>
> Converting all vmstat counters to bytes or even all slab
> counters to bytes would introduce an additional overhead.
> So instead let's store global and per-node counters
> in pages, and memcg and lruvec counters in bytes.
>
> To make the API clean all access helpers (both on the read
> and write sides) are dealing with bytes.
>

The "dealing with bytes" is only for slab stats or all vmstat stats?

> To avoid back-and-forth conversions a new flavor of read-side
> helpers is introduced, which always returns values in pages:
> node_page_state_pages() and global_node_page_state_pages().
>
> Actually new helpers are just reading raw values. Old helpers are
> simple wrappers, which will complain on an attempt to read
> byte value, because at the moment no one actually needs bytes.
>
> Thanks to Johannes Weiner for the idea of having the byte-sized API
> on top of the page-sized internal storage.
>
> Signed-off-by: Roman Gushchin <guro@fb.com>
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
> Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
> ---
>  drivers/base/node.c    |  2 +-
>  include/linux/mmzone.h | 10 ++++++++++
>  include/linux/vmstat.h | 14 +++++++++++++-
>  mm/memcontrol.c        | 14 ++++++++++----
>  mm/vmstat.c            | 30 ++++++++++++++++++++++++++----
>  5 files changed, 60 insertions(+), 10 deletions(-)
>
> [...]
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e8a91e98556b..07d02e61a73e 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -681,13 +681,16 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
>   */
>  void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
>  {
> -        long x;
> +        long x, threshold = MEMCG_CHARGE_BATCH;
>
>          if (mem_cgroup_disabled())
>                  return;
>
> +        if (vmstat_item_in_bytes(idx))
> +                threshold <<= PAGE_SHIFT;
> +

From the above am I understanding correctly that even after moving to
byte-level accounting, we can still see stats with potential error
limited by (BATCH-1)*PAGE_SIZE*nr_cpus?

> [...]
>
> @@ -989,8 +1003,8 @@ unsigned long sum_zone_numa_state(int node,
>  /*
>   * Determine the per node value of a stat item.
>   */
> -unsigned long node_page_state(struct pglist_data *pgdat,
> -                              enum node_stat_item item)
> +unsigned long node_page_state_pages(struct pglist_data *pgdat,
> +                                    enum node_stat_item item)
>  {
>          long x = atomic_long_read(&pgdat->vm_stat[item]);
>  #ifdef CONFIG_SMP
> @@ -999,6 +1013,14 @@ unsigned long node_page_state(struct pglist_data *pgdat,
>  #endif
>          return x;
>  }
> +
> +unsigned long node_page_state(struct pglist_data *pgdat,
> +                              enum node_stat_item item)
> +{
> +        VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
> +
> +        return node_page_state_pages(pgdat, item);
> +}

So, for non-slab, node_page_state and node_page_state_pages will be
the same but different for slab vmstats. However we should not be
calling node_page_state with slab vmstats because we don't need it,
right?

> [...]
> --
> 2.25.4
>
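The worst-case drift Shakeel asks about can be sketched as a small standalone calculation. This is an illustration only, not code from the patch: MEMCG_CHARGE_BATCH is assumed here to be 32 pages and PAGE_SIZE 4 KiB, and the per-cpu flush threshold for byte-sized items is the batch shifted by PAGE_SHIFT, as in the hunk quoted above.

/*
 * Illustration of the stat error bound with per-cpu batching.
 * Assumed values (not taken from this patch): a charge batch of 32 pages
 * and 4 KiB pages.  Each CPU may keep up to roughly one threshold worth
 * of updates unflushed, so the visible error scales with the CPU count.
 */
#define EXAMPLE_CHARGE_BATCH    32UL    /* assumed MEMCG_CHARGE_BATCH */
#define EXAMPLE_PAGE_SIZE       4096UL  /* assumed 4 KiB pages */

static unsigned long max_stat_error_bytes(unsigned long nr_cpus)
{
        /* worst case: every CPU sits just below its flush threshold */
        return (EXAMPLE_CHARGE_BATCH - 1) * EXAMPLE_PAGE_SIZE * nr_cpus;
}

/* e.g. for 64 CPUs: 31 * 4096 * 64 = 8126464 bytes, roughly 7.75 MiB */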
On Tue, Jun 16, 2020 at 07:57:54PM -0700, Shakeel Butt wrote:
> On Mon, Jun 8, 2020 at 4:07 PM Roman Gushchin <guro@fb.com> wrote:
> >
> > To implement per-object slab memory accounting, we need to
> > convert slab vmstat counters to bytes.
> > [...]
> > To make the API clean all access helpers (both on the read
> > and write sides) are dealing with bytes.
> >
>
> The "dealing with bytes" is only for slab stats or all vmstat stats?

Only slab stats as now. I've sent a percpu memory accounting patchset
separately, which will add another byte-sized counter.

> > [...]
> > +        if (vmstat_item_in_bytes(idx))
> > +                threshold <<= PAGE_SHIFT;
> > +
>
> From the above am I understanding correctly that even after moving to
> byte-level accounting, we can still see stats with potential error
> limited by (BATCH-1)*PAGE_SIZE*nr_cpus?
>
> > [...]
> > +unsigned long node_page_state(struct pglist_data *pgdat,
> > +                              enum node_stat_item item)
> > +{
> > +        VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
> > +
> > +        return node_page_state_pages(pgdat, item);
> > +}
>
> So, for non-slab, node_page_state and node_page_state_pages will be
> the same but different for slab vmstats. However we should not be
> calling node_page_state with slab vmstats because we don't need it,
> right?

Right.
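Roman's answer ("only slab stats as now") implies that vmstat_item_in_bytes(), which this patch introduces as a stub returning false, is meant to start returning true only for the byte-counted slab items added later in the series. A minimal sketch of that follow-up shape, assuming item names like NR_SLAB_RECLAIMABLE_B / NR_SLAB_UNRECLAIMABLE_B from the series direction (they are not part of this patch):

/*
 * Sketch of the eventual helper, under the assumption that byte-counted
 * slab items (NR_SLAB_RECLAIMABLE_B / NR_SLAB_UNRECLAIMABLE_B) are added
 * by a later patch.  Everything else keeps being reported in pages.
 */
static __always_inline bool vmstat_item_in_bytes(enum node_stat_item item)
{
        return item == NR_SLAB_RECLAIMABLE_B ||
               item == NR_SLAB_UNRECLAIMABLE_B;
}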
On Mon, Jun 8, 2020 at 4:07 PM Roman Gushchin <guro@fb.com> wrote:
>
> To implement per-object slab memory accounting, we need to
> convert slab vmstat counters to bytes.
> [...]
>
> Signed-off-by: Roman Gushchin <guro@fb.com>
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
> Reviewed-by: Vlastimil Babka <vbabka@suse.cz>

Reviewed-by: Shakeel Butt <shakeelb@google.com>
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 5b02f69769e8..e21e31359297 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -513,7 +513,7 @@ static ssize_t node_read_vmstat(struct device *dev,
 
         for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
                 n += sprintf(buf+n, "%s %lu\n", node_stat_name(i),
-                             node_page_state(pgdat, i));
+                             node_page_state_pages(pgdat, i));
 
         return n;
 }
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c4c37fd12104..fa8eb49d9898 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -206,6 +206,16 @@ enum node_stat_item {
         NR_VM_NODE_STAT_ITEMS
 };
 
+/*
+ * Returns true if the value is measured in bytes (most vmstat values are
+ * measured in pages). This defines the API part, the internal representation
+ * might be different.
+ */
+static __always_inline bool vmstat_item_in_bytes(enum node_stat_item item)
+{
+        return false;
+}
+
 /*
  * We do arithmetic on the LRU lists in various places in the code,
  * so it is important to keep the active lists LRU_ACTIVE higher in
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index aa961088c551..91220ace31da 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -8,6 +8,7 @@
 #include <linux/vm_event_item.h>
 #include <linux/atomic.h>
 #include <linux/static_key.h>
+#include <linux/mmdebug.h>
 
 extern int sysctl_stat_interval;
 
@@ -192,7 +193,8 @@ static inline unsigned long global_zone_page_state(enum zone_stat_item item)
         return x;
 }
 
-static inline unsigned long global_node_page_state(enum node_stat_item item)
+static inline
+unsigned long global_node_page_state_pages(enum node_stat_item item)
 {
         long x = atomic_long_read(&vm_node_stat[item]);
 #ifdef CONFIG_SMP
@@ -202,6 +204,13 @@ static inline unsigned long global_node_page_state(enum node_stat_item item)
         return x;
 }
 
+static inline unsigned long global_node_page_state(enum node_stat_item item)
+{
+        VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+
+        return global_node_page_state_pages(item);
+}
+
 static inline unsigned long zone_page_state(struct zone *zone,
                                         enum zone_stat_item item)
 {
@@ -242,9 +251,12 @@ extern unsigned long sum_zone_node_page_state(int node,
 extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item);
 extern unsigned long node_page_state(struct pglist_data *pgdat,
                                                 enum node_stat_item item);
+extern unsigned long node_page_state_pages(struct pglist_data *pgdat,
+                                           enum node_stat_item item);
 #else
 #define sum_zone_node_page_state(node, item) global_zone_page_state(item)
 #define node_page_state(node, item) global_node_page_state(item)
+#define node_page_state_pages(node, item) global_node_page_state_pages(item)
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_SMP
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e8a91e98556b..07d02e61a73e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -681,13 +681,16 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
  */
 void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 {
-        long x;
+        long x, threshold = MEMCG_CHARGE_BATCH;
 
         if (mem_cgroup_disabled())
                 return;
 
+        if (vmstat_item_in_bytes(idx))
+                threshold <<= PAGE_SHIFT;
+
         x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
-        if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+        if (unlikely(abs(x) > threshold)) {
                 struct mem_cgroup *mi;
 
                 /*
@@ -718,7 +721,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 {
         struct mem_cgroup_per_node *pn;
         struct mem_cgroup *memcg;
-        long x;
+        long x, threshold = MEMCG_CHARGE_BATCH;
 
         pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
         memcg = pn->memcg;
@@ -729,8 +732,11 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
         /* Update lruvec */
         __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
 
+        if (vmstat_item_in_bytes(idx))
+                threshold <<= PAGE_SHIFT;
+
         x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
-        if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+        if (unlikely(abs(x) > threshold)) {
                 pg_data_t *pgdat = lruvec_pgdat(lruvec);
                 struct mem_cgroup_per_node *pi;
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 80c9b6221535..f1c321e1d6d3 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -341,6 +341,11 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
         long x;
         long t;
 
+        if (vmstat_item_in_bytes(item)) {
+                VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
+                delta >>= PAGE_SHIFT;
+        }
+
         x = delta + __this_cpu_read(*p);
 
         t = __this_cpu_read(pcp->stat_threshold);
@@ -398,6 +403,8 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
         s8 __percpu *p = pcp->vm_node_stat_diff + item;
         s8 v, t;
 
+        VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+
         v = __this_cpu_inc_return(*p);
         t = __this_cpu_read(pcp->stat_threshold);
         if (unlikely(v > t)) {
@@ -442,6 +449,8 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
         s8 __percpu *p = pcp->vm_node_stat_diff + item;
         s8 v, t;
 
+        VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+
         v = __this_cpu_dec_return(*p);
         t = __this_cpu_read(pcp->stat_threshold);
         if (unlikely(v < - t)) {
@@ -541,6 +550,11 @@ static inline void mod_node_state(struct pglist_data *pgdat,
         s8 __percpu *p = pcp->vm_node_stat_diff + item;
         long o, n, t, z;
 
+        if (vmstat_item_in_bytes(item)) {
+                VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
+                delta >>= PAGE_SHIFT;
+        }
+
         do {
                 z = 0;  /* overflow to node counters */
 
@@ -989,8 +1003,8 @@ unsigned long sum_zone_numa_state(int node,
 /*
  * Determine the per node value of a stat item.
  */
-unsigned long node_page_state(struct pglist_data *pgdat,
-                              enum node_stat_item item)
+unsigned long node_page_state_pages(struct pglist_data *pgdat,
+                                    enum node_stat_item item)
 {
         long x = atomic_long_read(&pgdat->vm_stat[item]);
 #ifdef CONFIG_SMP
@@ -999,6 +1013,14 @@ unsigned long node_page_state(struct pglist_data *pgdat,
 #endif
         return x;
 }
+
+unsigned long node_page_state(struct pglist_data *pgdat,
+                              enum node_stat_item item)
+{
+        VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+
+        return node_page_state_pages(pgdat, item);
+}
 #endif
 
 #ifdef CONFIG_COMPACTION
@@ -1581,7 +1603,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
                 seq_printf(m, "\n per-node stats");
                 for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
                         seq_printf(m, "\n %-12s %lu", node_stat_name(i),
-                                   node_page_state(pgdat, i));
+                                   node_page_state_pages(pgdat, i));
                 }
         }
         seq_printf(m,
@@ -1702,7 +1724,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
 #endif
 
         for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
-                v[i] = global_node_page_state(i);
+                v[i] = global_node_page_state_pages(i);
         v += NR_VM_NODE_STAT_ITEMS;
 
         global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
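To make the read-side contract of the patch concrete, here is a hypothetical caller, not part of the patch; it assumes the pre-series node stat item names NR_SLAB_RECLAIMABLE / NR_SLAB_UNRECLAIMABLE. Node-level counters stay page-sized internally, so a reader that wants raw page counts uses the _pages() flavor, while node_page_state() remains a thin wrapper that additionally warns if a byte-sized item is ever passed to it.

/*
 * Hypothetical reader (illustration only): sums the node's slab pages via
 * the page-sized read helpers introduced by the patch.  node_page_state()
 * would return the same value here, but it also VM_WARN_ON_ONCE()s when
 * called with a byte-sized item, so the _pages() flavor is the neutral
 * choice for raw page counts.
 */
static unsigned long node_slab_pages(struct pglist_data *pgdat)
{
        return node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE) +
               node_page_state_pages(pgdat, NR_SLAB_UNRECLAIMABLE);
}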