Message ID: 20210420192907.30880-3-longman@redhat.com (mailing list archive)
State: New, archived
Series: mm/memcg: Reduce kmemcache memory accounting overhead
On Tue, Apr 20, 2021 at 03:29:05PM -0400, Waiman Long wrote: > Before the new slab memory controller with per object byte charging, > charging and vmstat data update happen only when new slab pages are > allocated or freed. Now they are done with every kmem_cache_alloc() > and kmem_cache_free(). This causes additional overhead for workloads > that generate a lot of alloc and free calls. > > The memcg_stock_pcp is used to cache byte charge for a specific > obj_cgroup to reduce that overhead. To further reducing it, this patch > makes the vmstat data cached in the memcg_stock_pcp structure as well > until it accumulates a page size worth of update or when other cached > data change. Caching the vmstat data in the per-cpu stock eliminates two > writes to non-hot cachelines for memcg specific as well as memcg-lruvecs > specific vmstat data by a write to a hot local stock cacheline. > > On a 2-socket Cascade Lake server with instrumentation enabled and this > patch applied, it was found that about 20% (634400 out of 3243830) > of the time when mod_objcg_state() is called leads to an actual call > to __mod_objcg_state() after initial boot. When doing parallel kernel > build, the figure was about 17% (24329265 out of 142512465). So caching > the vmstat data reduces the number of calls to __mod_objcg_state() > by more than 80%. > > Signed-off-by: Waiman Long <longman@redhat.com> > Reviewed-by: Shakeel Butt <shakeelb@google.com> > --- > mm/memcontrol.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++-- > 1 file changed, 83 insertions(+), 3 deletions(-) > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 7cd7187a017c..292b4783b1a7 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -782,8 +782,9 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) > rcu_read_unlock(); > } > > -void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, > - enum node_stat_item idx, int nr) > +static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, > + struct pglist_data *pgdat, > + enum node_stat_item idx, int nr) > { > struct mem_cgroup *memcg; > struct lruvec *lruvec; > @@ -791,7 +792,7 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, > rcu_read_lock(); > memcg = obj_cgroup_memcg(objcg); > lruvec = mem_cgroup_lruvec(memcg, pgdat); > - mod_memcg_lruvec_state(lruvec, idx, nr); > + __mod_memcg_lruvec_state(lruvec, idx, nr); > rcu_read_unlock(); > } > > @@ -2059,7 +2060,10 @@ struct memcg_stock_pcp { > > #ifdef CONFIG_MEMCG_KMEM > struct obj_cgroup *cached_objcg; > + struct pglist_data *cached_pgdat; I wonder if we want to have per-node counters instead? That would complicate the initialization of pcp stocks a bit, but might shave off some additional cpu time. But we can do it later too. > unsigned int nr_bytes; > + int nr_slab_reclaimable_b; > + int nr_slab_unreclaimable_b; > #endif > > struct work_struct work; > @@ -3008,6 +3012,63 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) > obj_cgroup_put(objcg); > } > > +void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, > + enum node_stat_item idx, int nr) > +{ > + struct memcg_stock_pcp *stock; > + unsigned long flags; > + int *bytes; > + > + local_irq_save(flags); > + stock = this_cpu_ptr(&memcg_stock); > + > + /* > + * Save vmstat data in stock and skip vmstat array update unless > + * accumulating over a page of vmstat data or when pgdat or idx > + * changes. 
> + */ > + if (stock->cached_objcg != objcg) { > + drain_obj_stock(stock); > + obj_cgroup_get(objcg); > + stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) > + ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; > + stock->cached_objcg = objcg; > + stock->cached_pgdat = pgdat; > + } else if (stock->cached_pgdat != pgdat) { > + /* Flush the existing cached vmstat data */ > + if (stock->nr_slab_reclaimable_b) { > + mod_objcg_mlstate(objcg, pgdat, NR_SLAB_RECLAIMABLE_B, > + stock->nr_slab_reclaimable_b); > + stock->nr_slab_reclaimable_b = 0; > + } > + if (stock->nr_slab_unreclaimable_b) { > + mod_objcg_mlstate(objcg, pgdat, NR_SLAB_UNRECLAIMABLE_B, > + stock->nr_slab_unreclaimable_b); > + stock->nr_slab_unreclaimable_b = 0; > + } > + stock->cached_pgdat = pgdat; > + } > + > + bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b > + : &stock->nr_slab_unreclaimable_b; > + if (!*bytes) { > + *bytes = nr; > + nr = 0; > + } else { > + *bytes += nr; > + if (abs(*bytes) > PAGE_SIZE) { > + nr = *bytes; > + *bytes = 0; > + } else { > + nr = 0; > + } > + } This part is a little bit hard to follow, how about something like this (completely untested): { stocked = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b : &stock->nr_slab_unreclaimable_b; if (abs(*stocked + nr) > PAGE_SIZE) { nr += *stocked; *stocked = 0; } else { *stocked += nr; nr = 0; } } > + if (nr) > + mod_objcg_mlstate(objcg, pgdat, idx, nr); > + > + local_irq_restore(flags); > +} > + > static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) > { > struct memcg_stock_pcp *stock; > @@ -3055,6 +3116,25 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock) > stock->nr_bytes = 0; > } > > + /* > + * Flush the vmstat data in current stock > + */ > + if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) { > + if (stock->nr_slab_reclaimable_b) { > + mod_objcg_mlstate(old, stock->cached_pgdat, > + NR_SLAB_RECLAIMABLE_B, > + stock->nr_slab_reclaimable_b); > + stock->nr_slab_reclaimable_b = 0; > + } > + if (stock->nr_slab_unreclaimable_b) { > + mod_objcg_mlstate(old, stock->cached_pgdat, > + NR_SLAB_UNRECLAIMABLE_B, > + stock->nr_slab_unreclaimable_b); > + stock->nr_slab_unreclaimable_b = 0; > + } > + stock->cached_pgdat = NULL; > + } > + > obj_cgroup_put(old); > stock->cached_objcg = NULL; > } > -- > 2.18.1 >
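For context, the function being tuned here is fed from the slab accounting hooks. Below is a rough, simplified sketch of a call site (an editor's illustration, not part of the patch or the thread): the hook name memcg_slab_post_alloc_hook() and the helpers cache_vmstat_idx() and obj_full_size() are assumed from the mainline slab code of that period, and the real hook takes different arguments and iterates over an array of objects.

	static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
						      struct obj_cgroup *objcg,
						      struct page *page, void *p)
	{
		if (!objcg || !p)
			return;

		/*
		 * Every allocated object feeds a byte-sized delta into
		 * mod_objcg_state(); idx picks the reclaimable vs.
		 * unreclaimable slab counter, pgdat the node being charged.
		 * This per-object call rate is what makes the per-cpu
		 * caching above worthwhile.
		 */
		mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s),
				obj_full_size(s));
	}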
On 4/21/21 7:28 PM, Roman Gushchin wrote: > On Tue, Apr 20, 2021 at 03:29:05PM -0400, Waiman Long wrote: >> Before the new slab memory controller with per object byte charging, >> charging and vmstat data update happen only when new slab pages are >> allocated or freed. Now they are done with every kmem_cache_alloc() >> and kmem_cache_free(). This causes additional overhead for workloads >> that generate a lot of alloc and free calls. >> >> The memcg_stock_pcp is used to cache byte charge for a specific >> obj_cgroup to reduce that overhead. To further reducing it, this patch >> makes the vmstat data cached in the memcg_stock_pcp structure as well >> until it accumulates a page size worth of update or when other cached >> data change. Caching the vmstat data in the per-cpu stock eliminates two >> writes to non-hot cachelines for memcg specific as well as memcg-lruvecs >> specific vmstat data by a write to a hot local stock cacheline. >> >> On a 2-socket Cascade Lake server with instrumentation enabled and this >> patch applied, it was found that about 20% (634400 out of 3243830) >> of the time when mod_objcg_state() is called leads to an actual call >> to __mod_objcg_state() after initial boot. When doing parallel kernel >> build, the figure was about 17% (24329265 out of 142512465). So caching >> the vmstat data reduces the number of calls to __mod_objcg_state() >> by more than 80%. >> >> Signed-off-by: Waiman Long <longman@redhat.com> >> Reviewed-by: Shakeel Butt <shakeelb@google.com> >> --- >> mm/memcontrol.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++-- >> 1 file changed, 83 insertions(+), 3 deletions(-) >> >> diff --git a/mm/memcontrol.c b/mm/memcontrol.c >> index 7cd7187a017c..292b4783b1a7 100644 >> --- a/mm/memcontrol.c >> +++ b/mm/memcontrol.c >> @@ -782,8 +782,9 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) >> rcu_read_unlock(); >> } >> >> -void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, >> - enum node_stat_item idx, int nr) >> +static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, >> + struct pglist_data *pgdat, >> + enum node_stat_item idx, int nr) >> { >> struct mem_cgroup *memcg; >> struct lruvec *lruvec; >> @@ -791,7 +792,7 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, >> rcu_read_lock(); >> memcg = obj_cgroup_memcg(objcg); >> lruvec = mem_cgroup_lruvec(memcg, pgdat); >> - mod_memcg_lruvec_state(lruvec, idx, nr); >> + __mod_memcg_lruvec_state(lruvec, idx, nr); >> rcu_read_unlock(); >> } >> >> @@ -2059,7 +2060,10 @@ struct memcg_stock_pcp { >> >> #ifdef CONFIG_MEMCG_KMEM >> struct obj_cgroup *cached_objcg; >> + struct pglist_data *cached_pgdat; > I wonder if we want to have per-node counters instead? > That would complicate the initialization of pcp stocks a bit, > but might shave off some additional cpu time. > But we can do it later too. > A per node counter will certainly complicate the code and reduce the performance benefit too. I got a pretty good hit rate of 80%+ with the current code on a 2-socket system. The hit rate will probably drop when there are more nodes. I will do some more investigation, but it will not be for this patchset. 
>> unsigned int nr_bytes; >> + int nr_slab_reclaimable_b; >> + int nr_slab_unreclaimable_b; >> #endif >> >> struct work_struct work; >> @@ -3008,6 +3012,63 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) >> obj_cgroup_put(objcg); >> } >> >> +void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, >> + enum node_stat_item idx, int nr) >> +{ >> + struct memcg_stock_pcp *stock; >> + unsigned long flags; >> + int *bytes; >> + >> + local_irq_save(flags); >> + stock = this_cpu_ptr(&memcg_stock); >> + >> + /* >> + * Save vmstat data in stock and skip vmstat array update unless >> + * accumulating over a page of vmstat data or when pgdat or idx >> + * changes. >> + */ >> + if (stock->cached_objcg != objcg) { >> + drain_obj_stock(stock); >> + obj_cgroup_get(objcg); >> + stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) >> + ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; >> + stock->cached_objcg = objcg; >> + stock->cached_pgdat = pgdat; >> + } else if (stock->cached_pgdat != pgdat) { >> + /* Flush the existing cached vmstat data */ >> + if (stock->nr_slab_reclaimable_b) { >> + mod_objcg_mlstate(objcg, pgdat, NR_SLAB_RECLAIMABLE_B, >> + stock->nr_slab_reclaimable_b); >> + stock->nr_slab_reclaimable_b = 0; >> + } >> + if (stock->nr_slab_unreclaimable_b) { >> + mod_objcg_mlstate(objcg, pgdat, NR_SLAB_UNRECLAIMABLE_B, >> + stock->nr_slab_unreclaimable_b); >> + stock->nr_slab_unreclaimable_b = 0; >> + } >> + stock->cached_pgdat = pgdat; >> + } >> + >> + bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b >> + : &stock->nr_slab_unreclaimable_b; >> + if (!*bytes) { >> + *bytes = nr; >> + nr = 0; >> + } else { >> + *bytes += nr; >> + if (abs(*bytes) > PAGE_SIZE) { >> + nr = *bytes; >> + *bytes = 0; >> + } else { >> + nr = 0; >> + } >> + } > This part is a little bit hard to follow, how about something like this > (completely untested): > > { > stocked = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b > : &stock->nr_slab_unreclaimable_b; > if (abs(*stocked + nr) > PAGE_SIZE) { > nr += *stocked; > *stocked = 0; > } else { > *stocked += nr; > nr = 0; > } > } That was done purposely to make sure that large object (>= 4k) will also be cached once before flushing it out. I should have been more clear about that by adding a comment about it. vmstat data isn't as critical as memory charge and so I am allowing it to cache more than 4k in this case. Cheers, Longman
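To make that difference concrete, here is a small stand-alone sketch (editor's illustration compiled in user space with a fake PAGE_SIZE, not kernel code) showing how the two variants handle a first update larger than a page: the patch's version stocks it once, the simplified version flushes it immediately.

	#include <stdio.h>
	#include <stdlib.h>

	#define PAGE_SIZE 4096

	/* Patch variant: an empty slot always absorbs the delta, even a large one */
	static int stock_v1(int *stocked, int nr)
	{
		if (!*stocked) {
			*stocked = nr;
			return 0;		/* nothing flushed */
		}
		*stocked += nr;
		if (abs(*stocked) > PAGE_SIZE) {
			nr = *stocked;
			*stocked = 0;
			return nr;		/* flush the accumulated total */
		}
		return 0;
	}

	/* Suggested simplification: flush as soon as the running total exceeds a page */
	static int stock_v2(int *stocked, int nr)
	{
		if (abs(*stocked + nr) > PAGE_SIZE) {
			nr += *stocked;
			*stocked = 0;
			return nr;		/* flushed immediately */
		}
		*stocked += nr;
		return 0;
	}

	int main(void)
	{
		int s1 = 0, s2 = 0;
		int flushed;

		/* first update: a single 8 KiB (two-page) object */
		flushed = stock_v1(&s1, 8192);
		printf("v1: flushed %d, stocked %d\n", flushed, s1);	/* flushed 0, stocked 8192 */

		flushed = stock_v2(&s2, 8192);
		printf("v2: flushed %d, stocked %d\n", flushed, s2);	/* flushed 8192, stocked 0 */

		return 0;
	}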
On Thu, Apr 22, 2021 at 12:58:52PM -0400, Waiman Long wrote: > On 4/21/21 7:28 PM, Roman Gushchin wrote: > > On Tue, Apr 20, 2021 at 03:29:05PM -0400, Waiman Long wrote: > > > Before the new slab memory controller with per object byte charging, > > > charging and vmstat data update happen only when new slab pages are > > > allocated or freed. Now they are done with every kmem_cache_alloc() > > > and kmem_cache_free(). This causes additional overhead for workloads > > > that generate a lot of alloc and free calls. > > > > > > The memcg_stock_pcp is used to cache byte charge for a specific > > > obj_cgroup to reduce that overhead. To further reducing it, this patch > > > makes the vmstat data cached in the memcg_stock_pcp structure as well > > > until it accumulates a page size worth of update or when other cached > > > data change. Caching the vmstat data in the per-cpu stock eliminates two > > > writes to non-hot cachelines for memcg specific as well as memcg-lruvecs > > > specific vmstat data by a write to a hot local stock cacheline. > > > > > > On a 2-socket Cascade Lake server with instrumentation enabled and this > > > patch applied, it was found that about 20% (634400 out of 3243830) > > > of the time when mod_objcg_state() is called leads to an actual call > > > to __mod_objcg_state() after initial boot. When doing parallel kernel > > > build, the figure was about 17% (24329265 out of 142512465). So caching > > > the vmstat data reduces the number of calls to __mod_objcg_state() > > > by more than 80%. > > > > > > Signed-off-by: Waiman Long <longman@redhat.com> > > > Reviewed-by: Shakeel Butt <shakeelb@google.com> > > > --- > > > mm/memcontrol.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++-- > > > 1 file changed, 83 insertions(+), 3 deletions(-) > > > > > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > > > index 7cd7187a017c..292b4783b1a7 100644 > > > --- a/mm/memcontrol.c > > > +++ b/mm/memcontrol.c > > > @@ -782,8 +782,9 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) > > > rcu_read_unlock(); > > > } > > > -void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, > > > - enum node_stat_item idx, int nr) > > > +static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, > > > + struct pglist_data *pgdat, > > > + enum node_stat_item idx, int nr) > > > { > > > struct mem_cgroup *memcg; > > > struct lruvec *lruvec; > > > @@ -791,7 +792,7 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, > > > rcu_read_lock(); > > > memcg = obj_cgroup_memcg(objcg); > > > lruvec = mem_cgroup_lruvec(memcg, pgdat); > > > - mod_memcg_lruvec_state(lruvec, idx, nr); > > > + __mod_memcg_lruvec_state(lruvec, idx, nr); > > > rcu_read_unlock(); > > > } > > > @@ -2059,7 +2060,10 @@ struct memcg_stock_pcp { > > > #ifdef CONFIG_MEMCG_KMEM > > > struct obj_cgroup *cached_objcg; > > > + struct pglist_data *cached_pgdat; > > I wonder if we want to have per-node counters instead? > > That would complicate the initialization of pcp stocks a bit, > > but might shave off some additional cpu time. > > But we can do it later too. > > > A per node counter will certainly complicate the code and reduce the > performance benefit too. Hm, why? We wouldn't need to flush the stock if the release happens on some other cpu not matching the current pgdat. > I got a pretty good hit rate of 80%+ with the > current code on a 2-socket system. The hit rate will probably drop when > there are more nodes. 
> I will do some more investigation, but it will not be for this patchset.

Works for me! Thanks!
On 4/22/21 9:56 PM, Roman Gushchin wrote: > On Thu, Apr 22, 2021 at 12:58:52PM -0400, Waiman Long wrote: >> On 4/21/21 7:28 PM, Roman Gushchin wrote: >>> On Tue, Apr 20, 2021 at 03:29:05PM -0400, Waiman Long wrote: >>>> Before the new slab memory controller with per object byte charging, >>>> charging and vmstat data update happen only when new slab pages are >>>> allocated or freed. Now they are done with every kmem_cache_alloc() >>>> and kmem_cache_free(). This causes additional overhead for workloads >>>> that generate a lot of alloc and free calls. >>>> >>>> The memcg_stock_pcp is used to cache byte charge for a specific >>>> obj_cgroup to reduce that overhead. To further reducing it, this patch >>>> makes the vmstat data cached in the memcg_stock_pcp structure as well >>>> until it accumulates a page size worth of update or when other cached >>>> data change. Caching the vmstat data in the per-cpu stock eliminates two >>>> writes to non-hot cachelines for memcg specific as well as memcg-lruvecs >>>> specific vmstat data by a write to a hot local stock cacheline. >>>> >>>> On a 2-socket Cascade Lake server with instrumentation enabled and this >>>> patch applied, it was found that about 20% (634400 out of 3243830) >>>> of the time when mod_objcg_state() is called leads to an actual call >>>> to __mod_objcg_state() after initial boot. When doing parallel kernel >>>> build, the figure was about 17% (24329265 out of 142512465). So caching >>>> the vmstat data reduces the number of calls to __mod_objcg_state() >>>> by more than 80%. >>>> >>>> Signed-off-by: Waiman Long <longman@redhat.com> >>>> Reviewed-by: Shakeel Butt <shakeelb@google.com> >>>> --- >>>> mm/memcontrol.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++-- >>>> 1 file changed, 83 insertions(+), 3 deletions(-) >>>> >>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c >>>> index 7cd7187a017c..292b4783b1a7 100644 >>>> --- a/mm/memcontrol.c >>>> +++ b/mm/memcontrol.c >>>> @@ -782,8 +782,9 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) >>>> rcu_read_unlock(); >>>> } >>>> -void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, >>>> - enum node_stat_item idx, int nr) >>>> +static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, >>>> + struct pglist_data *pgdat, >>>> + enum node_stat_item idx, int nr) >>>> { >>>> struct mem_cgroup *memcg; >>>> struct lruvec *lruvec; >>>> @@ -791,7 +792,7 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, >>>> rcu_read_lock(); >>>> memcg = obj_cgroup_memcg(objcg); >>>> lruvec = mem_cgroup_lruvec(memcg, pgdat); >>>> - mod_memcg_lruvec_state(lruvec, idx, nr); >>>> + __mod_memcg_lruvec_state(lruvec, idx, nr); >>>> rcu_read_unlock(); >>>> } >>>> @@ -2059,7 +2060,10 @@ struct memcg_stock_pcp { >>>> #ifdef CONFIG_MEMCG_KMEM >>>> struct obj_cgroup *cached_objcg; >>>> + struct pglist_data *cached_pgdat; >>> I wonder if we want to have per-node counters instead? >>> That would complicate the initialization of pcp stocks a bit, >>> but might shave off some additional cpu time. >>> But we can do it later too. >>> >> A per node counter will certainly complicate the code and reduce the >> performance benefit too. > Hm, why? We wouldn't need to flush the stock if the release happens > on some other cpu not matching the current pgdat. I had actually experimented with just caching vmstat data for the local node only. It turned out the hit rate was a bit lower. 
That is why I kept the current approach; finding a better one will need further investigation.

Cheers,
Longman
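For reference, a rough, untested sketch of the per-node alternative discussed above (editor's illustration only; the MAX_NUMNODES-sized arrays and the helper name stock_add() are assumptions, not anything proposed verbatim in the thread):

	/* Hypothetical layout: one reclaimable/unreclaimable pair per node,
	 * so cached_pgdat and the pgdat-mismatch flush are no longer needed. */
	struct memcg_stock_pcp {
		/* ... existing fields ... */
	#ifdef CONFIG_MEMCG_KMEM
		struct obj_cgroup *cached_objcg;
		unsigned int nr_bytes;
		int nr_slab_reclaimable_b[MAX_NUMNODES];
		int nr_slab_unreclaimable_b[MAX_NUMNODES];
	#endif
		/* ... */
	};

	/* Updates for any node index straight into that node's slot; only an
	 * objcg change or an over-PAGE_SIZE total would force a flush. */
	static void stock_add(struct memcg_stock_pcp *stock,
			      struct pglist_data *pgdat,
			      enum node_stat_item idx, int nr)
	{
		int nid = pgdat->node_id;
		int *bytes = (idx == NR_SLAB_RECLAIMABLE_B) ?
				&stock->nr_slab_reclaimable_b[nid] :
				&stock->nr_slab_unreclaimable_b[nid];

		*bytes += nr;
		if (abs(*bytes) > PAGE_SIZE) {
			mod_objcg_mlstate(stock->cached_objcg, pgdat, idx, *bytes);
			*bytes = 0;
		}
	}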
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7cd7187a017c..292b4783b1a7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -782,8 +782,9 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
 	rcu_read_unlock();
 }
 
-void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
-		     enum node_stat_item idx, int nr)
+static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
+				     struct pglist_data *pgdat,
+				     enum node_stat_item idx, int nr)
 {
 	struct mem_cgroup *memcg;
 	struct lruvec *lruvec;
@@ -791,7 +792,7 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
 	rcu_read_lock();
 	memcg = obj_cgroup_memcg(objcg);
 	lruvec = mem_cgroup_lruvec(memcg, pgdat);
-	mod_memcg_lruvec_state(lruvec, idx, nr);
+	__mod_memcg_lruvec_state(lruvec, idx, nr);
 	rcu_read_unlock();
 }
 
@@ -2059,7 +2060,10 @@ struct memcg_stock_pcp {
 
 #ifdef CONFIG_MEMCG_KMEM
 	struct obj_cgroup *cached_objcg;
+	struct pglist_data *cached_pgdat;
 	unsigned int nr_bytes;
+	int nr_slab_reclaimable_b;
+	int nr_slab_unreclaimable_b;
 #endif
 
 	struct work_struct work;
@@ -3008,6 +3012,63 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
 	obj_cgroup_put(objcg);
 }
 
+void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
+		     enum node_stat_item idx, int nr)
+{
+	struct memcg_stock_pcp *stock;
+	unsigned long flags;
+	int *bytes;
+
+	local_irq_save(flags);
+	stock = this_cpu_ptr(&memcg_stock);
+
+	/*
+	 * Save vmstat data in stock and skip vmstat array update unless
+	 * accumulating over a page of vmstat data or when pgdat or idx
+	 * changes.
+	 */
+	if (stock->cached_objcg != objcg) {
+		drain_obj_stock(stock);
+		obj_cgroup_get(objcg);
+		stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
+				? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
+		stock->cached_objcg = objcg;
+		stock->cached_pgdat = pgdat;
+	} else if (stock->cached_pgdat != pgdat) {
+		/* Flush the existing cached vmstat data */
+		if (stock->nr_slab_reclaimable_b) {
+			mod_objcg_mlstate(objcg, pgdat, NR_SLAB_RECLAIMABLE_B,
+					  stock->nr_slab_reclaimable_b);
+			stock->nr_slab_reclaimable_b = 0;
+		}
+		if (stock->nr_slab_unreclaimable_b) {
+			mod_objcg_mlstate(objcg, pgdat, NR_SLAB_UNRECLAIMABLE_B,
+					  stock->nr_slab_unreclaimable_b);
+			stock->nr_slab_unreclaimable_b = 0;
+		}
+		stock->cached_pgdat = pgdat;
+	}
+
+	bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
+					       : &stock->nr_slab_unreclaimable_b;
+	if (!*bytes) {
+		*bytes = nr;
+		nr = 0;
+	} else {
+		*bytes += nr;
+		if (abs(*bytes) > PAGE_SIZE) {
+			nr = *bytes;
+			*bytes = 0;
+		} else {
+			nr = 0;
+		}
+	}
+	if (nr)
+		mod_objcg_mlstate(objcg, pgdat, idx, nr);
+
+	local_irq_restore(flags);
+}
+
 static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
 {
 	struct memcg_stock_pcp *stock;
@@ -3055,6 +3116,25 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
 		stock->nr_bytes = 0;
 	}
 
+	/*
+	 * Flush the vmstat data in current stock
+	 */
+	if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
+		if (stock->nr_slab_reclaimable_b) {
+			mod_objcg_mlstate(old, stock->cached_pgdat,
+					  NR_SLAB_RECLAIMABLE_B,
+					  stock->nr_slab_reclaimable_b);
+			stock->nr_slab_reclaimable_b = 0;
+		}
+		if (stock->nr_slab_unreclaimable_b) {
+			mod_objcg_mlstate(old, stock->cached_pgdat,
+					  NR_SLAB_UNRECLAIMABLE_B,
+					  stock->nr_slab_unreclaimable_b);
+			stock->nr_slab_unreclaimable_b = 0;
+		}
+		stock->cached_pgdat = NULL;
+	}
+
 	obj_cgroup_put(old);
 	stock->cached_objcg = NULL;
 }