Message ID | 20201206101451.14706-10-songmuchun@bytedance.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Convert all vmstat counters to pages or bytes | expand |
On Sun, Dec 06, 2020 at 06:14:48PM +0800, Muchun Song wrote: > the global and per-node counters are stored in pages, however memcg > and lruvec counters are stored in bytes. This scheme looks weird. > So convert all vmstat slab counters to bytes. There is a reason for this weird scheme: percpu caches (see struct per_cpu_nodestat) are s8, so counting in bytes will lead to overfills. Switching to s32 can lead to an increase in the cache thrashing, especially on small machines. > > Signed-off-by: Muchun Song <songmuchun@bytedance.com> > --- > include/linux/vmstat.h | 17 ++++++++++------- > mm/vmstat.c | 21 ++++++++++----------- > 2 files changed, 20 insertions(+), 18 deletions(-) > > diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h > index 322dcbfcc933..fd1a3d5d4926 100644 > --- a/include/linux/vmstat.h > +++ b/include/linux/vmstat.h > @@ -197,18 +197,26 @@ static inline > unsigned long global_node_page_state_pages(enum node_stat_item item) > { > long x = atomic_long_read(&vm_node_stat[item]); > + > #ifdef CONFIG_SMP > if (x < 0) > x = 0; > #endif > + if (vmstat_item_in_bytes(item)) > + x >>= PAGE_SHIFT; > return x; > } > > static inline unsigned long global_node_page_state(enum node_stat_item item) > { > - VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); > + long x = atomic_long_read(&vm_node_stat[item]); > > - return global_node_page_state_pages(item); > + VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); > +#ifdef CONFIG_SMP > + if (x < 0) > + x = 0; > +#endif > + return x; > } > > static inline unsigned long zone_page_state(struct zone *zone, > @@ -312,11 +320,6 @@ static inline void __mod_zone_page_state(struct zone *zone, > static inline void __mod_node_page_state(struct pglist_data *pgdat, > enum node_stat_item item, int delta) > { > - if (vmstat_item_in_bytes(item)) { > - VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); > - delta >>= PAGE_SHIFT; > - } > - > node_page_state_add(delta, pgdat, item); > } > > diff --git a/mm/vmstat.c b/mm/vmstat.c > index 
8d77ee426e22..7fb0c7cb9516 100644 > --- a/mm/vmstat.c > +++ b/mm/vmstat.c > @@ -345,11 +345,6 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, > long x; > long t; > > - if (vmstat_item_in_bytes(item)) { > - VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); > - delta >>= PAGE_SHIFT; > - } > - > x = delta + __this_cpu_read(*p); > > t = __this_cpu_read(pcp->stat_threshold); > @@ -554,11 +549,6 @@ static inline void mod_node_state(struct pglist_data *pgdat, > s8 __percpu *p = pcp->vm_node_stat_diff + item; > long o, n, t, z; > > - if (vmstat_item_in_bytes(item)) { > - VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); > - delta >>= PAGE_SHIFT; > - } > - > do { > z = 0; /* overflow to node counters */ > > @@ -1012,19 +1002,28 @@ unsigned long node_page_state_pages(struct pglist_data *pgdat, > enum node_stat_item item) > { > long x = atomic_long_read(&pgdat->vm_stat[item]); > + > #ifdef CONFIG_SMP > if (x < 0) > x = 0; > #endif > + if (vmstat_item_in_bytes(item)) > + x >>= PAGE_SHIFT; > return x; > } > > unsigned long node_page_state(struct pglist_data *pgdat, > enum node_stat_item item) > { > + long x = atomic_long_read(&pgdat->vm_stat[item]); > + > VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); > > - return node_page_state_pages(pgdat, item); > +#ifdef CONFIG_SMP > + if (x < 0) > + x = 0; > +#endif > + return x; > } > #endif > > -- > 2.11.0 >
On Mon, Dec 07, 2020 at 11:46:22AM -0800, Roman Gushchin wrote: > On Sun, Dec 06, 2020 at 06:14:48PM +0800, Muchun Song wrote: > > the global and per-node counters are stored in pages, however memcg > > and lruvec counters are stored in bytes. This scheme looks weird. > > So convert all vmstat slab counters to bytes. > > There is a reason for this weird scheme: > percpu caches (see struct per_cpu_nodestat) are s8, so counting in bytes > will lead to overfills. Switching to s32 can lead to an increase in > the cache thrashing, especially on small machines. JFYI: I tried to convert all slab counters to bytes and to change those s8 percpu batches to s32 about a year ago. Here is a link to that thread: https://patchwork.kernel.org/project/linux-mm/patch/20191018002820.307763-3-guro@fb.com/ Thanks!
On Tue, Dec 8, 2020 at 3:46 AM Roman Gushchin <guro@fb.com> wrote: > > On Sun, Dec 06, 2020 at 06:14:48PM +0800, Muchun Song wrote: > > the global and per-node counters are stored in pages, however memcg > > and lruvec counters are stored in bytes. This scheme looks weird. > > So convert all vmstat slab counters to bytes. > > There is a reason for this weird scheme: > percpu caches (see struct per_cpu_nodestat) are s8, so counting in bytes > will lead to overfills. Switching to s32 can lead to an increase in > the cache thrashing, especially on small machines. Thanks Roman. I see now. > > > > > Signed-off-by: Muchun Song <songmuchun@bytedance.com> > > --- > > include/linux/vmstat.h | 17 ++++++++++------- > > mm/vmstat.c | 21 ++++++++++----------- > > 2 files changed, 20 insertions(+), 18 deletions(-) > > > > diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h > > index 322dcbfcc933..fd1a3d5d4926 100644 > > --- a/include/linux/vmstat.h > > +++ b/include/linux/vmstat.h > > @@ -197,18 +197,26 @@ static inline > > unsigned long global_node_page_state_pages(enum node_stat_item item) > > { > > long x = atomic_long_read(&vm_node_stat[item]); > > + > > #ifdef CONFIG_SMP > > if (x < 0) > > x = 0; > > #endif > > + if (vmstat_item_in_bytes(item)) > > + x >>= PAGE_SHIFT; > > return x; > > } > > > > static inline unsigned long global_node_page_state(enum node_stat_item item) > > { > > - VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); > > + long x = atomic_long_read(&vm_node_stat[item]); > > > > - return global_node_page_state_pages(item); > > + VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); > > +#ifdef CONFIG_SMP > > + if (x < 0) > > + x = 0; > > +#endif > > + return x; > > } > > > > static inline unsigned long zone_page_state(struct zone *zone, > > @@ -312,11 +320,6 @@ static inline void __mod_zone_page_state(struct zone *zone, > > static inline void __mod_node_page_state(struct pglist_data *pgdat, > > enum node_stat_item item, int delta) > > { > > - if 
(vmstat_item_in_bytes(item)) { > > - VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); > > - delta >>= PAGE_SHIFT; > > - } > > - > > node_page_state_add(delta, pgdat, item); > > } > > > > diff --git a/mm/vmstat.c b/mm/vmstat.c > > index 8d77ee426e22..7fb0c7cb9516 100644 > > --- a/mm/vmstat.c > > +++ b/mm/vmstat.c > > @@ -345,11 +345,6 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, > > long x; > > long t; > > > > - if (vmstat_item_in_bytes(item)) { > > - VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); > > - delta >>= PAGE_SHIFT; > > - } > > - > > x = delta + __this_cpu_read(*p); > > > > t = __this_cpu_read(pcp->stat_threshold); > > @@ -554,11 +549,6 @@ static inline void mod_node_state(struct pglist_data *pgdat, > > s8 __percpu *p = pcp->vm_node_stat_diff + item; > > long o, n, t, z; > > > > - if (vmstat_item_in_bytes(item)) { > > - VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); > > - delta >>= PAGE_SHIFT; > > - } > > - > > do { > > z = 0; /* overflow to node counters */ > > > > @@ -1012,19 +1002,28 @@ unsigned long node_page_state_pages(struct pglist_data *pgdat, > > enum node_stat_item item) > > { > > long x = atomic_long_read(&pgdat->vm_stat[item]); > > + > > #ifdef CONFIG_SMP > > if (x < 0) > > x = 0; > > #endif > > + if (vmstat_item_in_bytes(item)) > > + x >>= PAGE_SHIFT; > > return x; > > } > > > > unsigned long node_page_state(struct pglist_data *pgdat, > > enum node_stat_item item) > > { > > + long x = atomic_long_read(&pgdat->vm_stat[item]); > > + > > VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); > > > > - return node_page_state_pages(pgdat, item); > > +#ifdef CONFIG_SMP > > + if (x < 0) > > + x = 0; > > +#endif > > + return x; > > } > > #endif > > > > -- > > 2.11.0 > >
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 322dcbfcc933..fd1a3d5d4926 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -197,18 +197,26 @@ static inline unsigned long global_node_page_state_pages(enum node_stat_item item) { long x = atomic_long_read(&vm_node_stat[item]); + #ifdef CONFIG_SMP if (x < 0) x = 0; #endif + if (vmstat_item_in_bytes(item)) + x >>= PAGE_SHIFT; return x; } static inline unsigned long global_node_page_state(enum node_stat_item item) { - VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); + long x = atomic_long_read(&vm_node_stat[item]); - return global_node_page_state_pages(item); + VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; } static inline unsigned long zone_page_state(struct zone *zone, @@ -312,11 +320,6 @@ static inline void __mod_zone_page_state(struct zone *zone, static inline void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, int delta) { - if (vmstat_item_in_bytes(item)) { - VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); - delta >>= PAGE_SHIFT; - } - node_page_state_add(delta, pgdat, item); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 8d77ee426e22..7fb0c7cb9516 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -345,11 +345,6 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, long x; long t; - if (vmstat_item_in_bytes(item)) { - VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); - delta >>= PAGE_SHIFT; - } - x = delta + __this_cpu_read(*p); t = __this_cpu_read(pcp->stat_threshold); @@ -554,11 +549,6 @@ static inline void mod_node_state(struct pglist_data *pgdat, s8 __percpu *p = pcp->vm_node_stat_diff + item; long o, n, t, z; - if (vmstat_item_in_bytes(item)) { - VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); - delta >>= PAGE_SHIFT; - } - do { z = 0; /* overflow to node counters */ @@ -1012,19 +1002,28 @@ unsigned long node_page_state_pages(struct pglist_data *pgdat, enum node_stat_item item) { long x 
= atomic_long_read(&pgdat->vm_stat[item]); + #ifdef CONFIG_SMP if (x < 0) x = 0; #endif + if (vmstat_item_in_bytes(item)) + x >>= PAGE_SHIFT; return x; } unsigned long node_page_state(struct pglist_data *pgdat, enum node_stat_item item) { + long x = atomic_long_read(&pgdat->vm_stat[item]); + VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); - return node_page_state_pages(pgdat, item); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; } #endif
The global and per-node counters are stored in pages, whereas the memcg and lruvec counters are stored in bytes. This scheme looks weird, so convert all vmstat slab counters to bytes. Signed-off-by: Muchun Song <songmuchun@bytedance.com> --- include/linux/vmstat.h | 17 ++++++++++------- mm/vmstat.c | 21 ++++++++++----------- 2 files changed, 20 insertions(+), 18 deletions(-)