
[v3,09/19] mm: memcg/slab: charge individual slab objects instead of pages

Message ID 20200422204708.2176080-10-guro@fb.com (mailing list archive)
State New, archived
Series: The new cgroup slab memory controller

Commit Message

Roman Gushchin April 22, 2020, 8:46 p.m. UTC
Switch to per-object accounting of non-root slab objects.

Charging is performed using the obj_cgroup API in the pre_alloc hook.
The obj_cgroup is charged with the size of the object plus the size of
the per-object metadata, which for now is the size of an obj_cgroup
pointer. If the memory has been charged successfully, the actual
allocation code is executed; otherwise -ENOMEM is returned.
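
Condensed from the hunks below, the per-object charge size and the
pre_alloc hook look like this:

static inline size_t obj_full_size(struct kmem_cache *s)
{
	/* Each accounted object also carries one obj_cgroup pointer. */
	return s->size + sizeof(struct obj_cgroup *);
}

static inline struct kmem_cache *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
						struct obj_cgroup **objcgp,
						size_t objects, gfp_t flags)
{
	struct kmem_cache *cachep;

	cachep = memcg_kmem_get_cache(s, objcgp);
	if (is_root_cache(cachep))
		return s;

	/* Charge the objcg up front; on failure the allocation is aborted. */
	if (obj_cgroup_charge(*objcgp, flags, objects * obj_full_size(s))) {
		memcg_kmem_put_cache(cachep);
		cachep = NULL;
	}

	return cachep;
}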

In the post_alloc hook, if the actual allocation succeeded, the
corresponding vmstats are bumped and the obj_cgroup pointer is saved.
Otherwise, the charge is canceled.
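
The post_alloc hook is roughly the following (condensed from the hunk
below; the loop and the local declarations are reconstructed here, the
hunk only shows its inner part):

static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
					      struct obj_cgroup *objcg,
					      size_t size, void **p)
{
	size_t i;

	for (i = 0; i < size; i++) {
		if (likely(p[i])) {
			struct page *page = virt_to_head_page(p[i]);
			unsigned int off = obj_to_index(s, page, p[i]);

			/*
			 * Save the obj_cgroup for the free path and bump the
			 * memcg/lruvec vmstat by the full object size.
			 */
			obj_cgroup_get(objcg);
			page_obj_cgroups(page)[off] = objcg;
			mod_objcg_state(objcg, page_pgdat(page),
					cache_vmstat_idx(s), obj_full_size(s));
		} else {
			/* The allocation failed: return the pre-charge. */
			obj_cgroup_uncharge(objcg, obj_full_size(s));
		}
	}
	obj_cgroup_put(objcg);
}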

On the free path the obj_cgroup pointer is obtained and used to uncharge
the size of the released object.
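
The core of the free hook (checks for unaccounted pages and caches are
omitted in this sketch; the signature follows the hunk below):

static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page,
					void *p)
{
	struct obj_cgroup *objcg;
	unsigned int off;

	/* Look up the obj_cgroup saved by the post_alloc hook... */
	off = obj_to_index(s, page, p);
	objcg = page_obj_cgroups(page)[off];
	page_obj_cgroups(page)[off] = NULL;

	/* ...and return the per-object charge and vmstat delta. */
	obj_cgroup_uncharge(objcg, obj_full_size(s));
	mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s),
			-obj_full_size(s));

	obj_cgroup_put(objcg);
}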

Memcg and lruvec counters now represent only the memory used by active
slab objects and do not include free space. Free space is shared and
doesn't belong to any specific cgroup.

Global per-node slab vmstats are still modified from the
(un)charge_slab_page() functions. The idea is to keep all slab pages
accounted as slab pages at the system level.

Signed-off-by: Roman Gushchin <guro@fb.com>
---
 mm/slab.h | 173 ++++++++++++++++++++++++------------------------------
 1 file changed, 77 insertions(+), 96 deletions(-)

Comments

Vlastimil Babka May 25, 2020, 4:10 p.m. UTC | #1
On 4/22/20 10:46 PM, Roman Gushchin wrote:
> Switch to per-object accounting of non-root slab objects.
> 
> Charging is performed using obj_cgroup API in the pre_alloc hook.
> Obj_cgroup is charged with the size of the object and the size
> of metadata: as now it's the size of an obj_cgroup pointer.
> If the amount of memory has been charged successfully, the actual
> allocation code is executed. Otherwise, -ENOMEM is returned.
> 
> In the post_alloc hook if the actual allocation succeeded,
> corresponding vmstats are bumped and the obj_cgroup pointer is saved.
> Otherwise, the charge is canceled.
> 
> On the free path obj_cgroup pointer is obtained and used to uncharge
> the size of the releasing object.
> 
> Memcg and lruvec counters are now representing only memory used
> by active slab objects and do not include the free space. The free
> space is shared and doesn't belong to any specific cgroup.
> 
> Global per-node slab vmstats are still modified from (un)charge_slab_page()
> functions. The idea is to keep all slab pages accounted as slab pages
> on system level.
> 
> Signed-off-by: Roman Gushchin <guro@fb.com>

Reviewed-by: Vlastimil Babka <vbabka@suse.cz>

Suggestion below:

> @@ -568,32 +548,33 @@ static __always_inline int charge_slab_page(struct page *page,
>  					    gfp_t gfp, int order,
>  					    struct kmem_cache *s)
>  {
> -	int ret;
> -
> -	if (is_root_cache(s)) {
> -		mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
> -				    PAGE_SIZE << order);
> -		return 0;
> -	}
> +#ifdef CONFIG_MEMCG_KMEM
> +	if (!is_root_cache(s)) {

This could also benefit from memcg_kmem_enabled() static key test AFAICS. Maybe
even have a wrapper for both tests together?
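
Something like the following, just to illustrate (the helper name is
made up here, it is not part of the patch):

/*
 * Hypothetical wrapper combining the static key test with the
 * root-cache check, so callers do the cheap test first.
 */
static inline bool memcg_slab_accounted(struct kmem_cache *s)
{
	return memcg_kmem_enabled() && !is_root_cache(s);
}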

> +		int ret;
>  
> -	ret = memcg_alloc_page_obj_cgroups(page, gfp, objs_per_slab(s));
> -	if (ret)
> -		return ret;
> +		ret = memcg_alloc_page_obj_cgroups(page, gfp, objs_per_slab(s));

You created memcg_alloc_page_obj_cgroups() empty variant for !CONFIG_MEMCG_KMEM
but now the only caller is under CONFIG_MEMCG_KMEM.

> +		if (ret)
> +			return ret;
>  
> -	return memcg_charge_slab(page, gfp, order, s);
> +		percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);

Perhaps moving this refcount into memcg_alloc_page_obj_cgroups() (maybe the name
should be different then) will allow you to not add #ifdef CONFIG_MEMCG_KMEM in
this function.
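
A possible shape of that idea, purely illustrative (the changed
signature and the body are guesses, not code from this series):

/*
 * Hypothetical variant: allocate the obj_cgroups vector and take the
 * per-page reference on the cache in one place, so charge_slab_page()
 * would not need its own #ifdef CONFIG_MEMCG_KMEM block.
 */
static inline int memcg_alloc_page_obj_cgroups(struct page *page, gfp_t gfp,
					       struct kmem_cache *s, int order)
{
	void *vec = kcalloc(objs_per_slab(s), sizeof(struct obj_cgroup *), gfp);

	if (!vec)
		return -ENOMEM;

	page->obj_cgroups = vec;
	percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
	return 0;
}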

Maybe this is all moot after patch 12/19, will find out :)

> +	}
> +#endif
> +	mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
> +			    PAGE_SIZE << order);
> +	return 0;
>  }
>  
>  static __always_inline void uncharge_slab_page(struct page *page, int order,
>  					       struct kmem_cache *s)
>  {
> -	if (is_root_cache(s)) {
> -		mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
> -				    -(PAGE_SIZE << order));
> -		return;
> +#ifdef CONFIG_MEMCG_KMEM
> +	if (!is_root_cache(s)) {

Everything from above also applies here.

> +		memcg_free_page_obj_cgroups(page);
> +		percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order);
>  	}
> -
> -	memcg_free_page_obj_cgroups(page);
> -	memcg_uncharge_slab(page, order, s);
> +#endif
> +	mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
> +			    -(PAGE_SIZE << order));
>  }
>  
>  static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
Roman Gushchin May 26, 2020, 6:04 p.m. UTC | #2
On Mon, May 25, 2020 at 06:10:55PM +0200, Vlastimil Babka wrote:
> On 4/22/20 10:46 PM, Roman Gushchin wrote:
> > Switch to per-object accounting of non-root slab objects.
> > 
> > Charging is performed using obj_cgroup API in the pre_alloc hook.
> > Obj_cgroup is charged with the size of the object and the size
> > of metadata: as now it's the size of an obj_cgroup pointer.
> > If the amount of memory has been charged successfully, the actual
> > allocation code is executed. Otherwise, -ENOMEM is returned.
> > 
> > In the post_alloc hook if the actual allocation succeeded,
> > corresponding vmstats are bumped and the obj_cgroup pointer is saved.
> > Otherwise, the charge is canceled.
> > 
> > On the free path obj_cgroup pointer is obtained and used to uncharge
> > the size of the releasing object.
> > 
> > Memcg and lruvec counters are now representing only memory used
> > by active slab objects and do not include the free space. The free
> > space is shared and doesn't belong to any specific cgroup.
> > 
> > Global per-node slab vmstats are still modified from (un)charge_slab_page()
> > functions. The idea is to keep all slab pages accounted as slab pages
> > on system level.
> > 
> > Signed-off-by: Roman Gushchin <guro@fb.com>
> 
> Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
> 
> Suggestion below:
> 
> > @@ -568,32 +548,33 @@ static __always_inline int charge_slab_page(struct page *page,
> >  					    gfp_t gfp, int order,
> >  					    struct kmem_cache *s)
> >  {
> > -	int ret;
> > -
> > -	if (is_root_cache(s)) {
> > -		mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
> > -				    PAGE_SIZE << order);
> > -		return 0;
> > -	}
> > +#ifdef CONFIG_MEMCG_KMEM
> > +	if (!is_root_cache(s)) {
> 
> This could also benefit from memcg_kmem_enabled() static key test AFAICS. Maybe
> even have a wrapper for both tests together?

Added.

> 
> > +		int ret;
> >  
> > -	ret = memcg_alloc_page_obj_cgroups(page, gfp, objs_per_slab(s));
> > -	if (ret)
> > -		return ret;
> > +		ret = memcg_alloc_page_obj_cgroups(page, gfp, objs_per_slab(s));
> 
> You created memcg_alloc_page_obj_cgroups() empty variant for !CONFIG_MEMCG_KMEM
> but now the only caller is under CONFIG_MEMCG_KMEM.

Good catch, thanks!

> 
> > +		if (ret)
> > +			return ret;
> >  
> > -	return memcg_charge_slab(page, gfp, order, s);
> > +		percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
> 
> Perhaps moving this refcount into memcg_alloc_page_obj_cgroups() (maybe the name
> should be different then) will allow you to not add #ifdef CONFIG_MEMCG_KMEM in
> this function.

The reference counter bumping is not related to obj_cgroups; we just
bump a counter for each slab page belonging to the kmem_cache. It will
go away later in the patchset together with the rest of the slab cache
refcounting.

> 
> Maybe this is all moot after patch 12/19, will find out :)
> 
> > +	}
> > +#endif
> > +	mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
> > +			    PAGE_SIZE << order);
> > +	return 0;
> >  }
> >  
> >  static __always_inline void uncharge_slab_page(struct page *page, int order,
> >  					       struct kmem_cache *s)
> >  {
> > -	if (is_root_cache(s)) {
> > -		mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
> > -				    -(PAGE_SIZE << order));
> > -		return;
> > +#ifdef CONFIG_MEMCG_KMEM
> > +	if (!is_root_cache(s)) {
> 
> Everything from above also applies here.

Done.
Thanks!

> 
> > +		memcg_free_page_obj_cgroups(page);
> > +		percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order);
> >  	}
> > -
> > -	memcg_free_page_obj_cgroups(page);
> > -	memcg_uncharge_slab(page, order, s);
> > +#endif
> > +	mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
> > +			    -(PAGE_SIZE << order));
> >  }
> >  
> >  static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
> 
>

Patch

diff --git a/mm/slab.h b/mm/slab.h
index 525e09e05743..0ecf14bec6a2 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -352,72 +352,6 @@  static inline struct mem_cgroup *memcg_from_slab_page(struct page *page)
 	return NULL;
 }
 
-/*
- * Charge the slab page belonging to the non-root kmem_cache.
- * Can be called for non-root kmem_caches only.
- */
-static __always_inline int memcg_charge_slab(struct page *page,
-					     gfp_t gfp, int order,
-					     struct kmem_cache *s)
-{
-	unsigned int nr_pages = 1 << order;
-	struct mem_cgroup *memcg;
-	struct lruvec *lruvec;
-	int ret;
-
-	rcu_read_lock();
-	memcg = READ_ONCE(s->memcg_params.memcg);
-	while (memcg && !css_tryget_online(&memcg->css))
-		memcg = parent_mem_cgroup(memcg);
-	rcu_read_unlock();
-
-	if (unlikely(!memcg || mem_cgroup_is_root(memcg))) {
-		mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
-				    nr_pages << PAGE_SHIFT);
-		percpu_ref_get_many(&s->memcg_params.refcnt, nr_pages);
-		return 0;
-	}
-
-	ret = memcg_kmem_charge(memcg, gfp, nr_pages);
-	if (ret)
-		goto out;
-
-	lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page));
-	mod_lruvec_state(lruvec, cache_vmstat_idx(s), nr_pages << PAGE_SHIFT);
-
-	percpu_ref_get_many(&s->memcg_params.refcnt, nr_pages);
-out:
-	css_put(&memcg->css);
-	return ret;
-}
-
-/*
- * Uncharge a slab page belonging to a non-root kmem_cache.
- * Can be called for non-root kmem_caches only.
- */
-static __always_inline void memcg_uncharge_slab(struct page *page, int order,
-						struct kmem_cache *s)
-{
-	unsigned int nr_pages = 1 << order;
-	struct mem_cgroup *memcg;
-	struct lruvec *lruvec;
-
-	rcu_read_lock();
-	memcg = READ_ONCE(s->memcg_params.memcg);
-	if (likely(!mem_cgroup_is_root(memcg))) {
-		lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page));
-		mod_lruvec_state(lruvec, cache_vmstat_idx(s),
-				 -(nr_pages << PAGE_SHIFT));
-		memcg_kmem_uncharge(memcg, nr_pages);
-	} else {
-		mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
-				    -(nr_pages << PAGE_SHIFT));
-	}
-	rcu_read_unlock();
-
-	percpu_ref_put_many(&s->memcg_params.refcnt, nr_pages);
-}
-
 static inline int memcg_alloc_page_obj_cgroups(struct page *page, gfp_t gfp,
 					       unsigned int objects)
 {
@@ -437,6 +371,47 @@  static inline void memcg_free_page_obj_cgroups(struct page *page)
 	page->obj_cgroups = NULL;
 }
 
+static inline size_t obj_full_size(struct kmem_cache *s)
+{
+	/*
+	 * For each accounted object there is an extra space which is used
+	 * to store obj_cgroup membership. Charge it too.
+	 */
+	return s->size + sizeof(struct obj_cgroup *);
+}
+
+static inline struct kmem_cache *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
+						struct obj_cgroup **objcgp,
+						size_t objects, gfp_t flags)
+{
+	struct kmem_cache *cachep;
+
+	cachep = memcg_kmem_get_cache(s, objcgp);
+	if (is_root_cache(cachep))
+		return s;
+
+	if (obj_cgroup_charge(*objcgp, flags, objects * obj_full_size(s))) {
+		memcg_kmem_put_cache(cachep);
+		cachep = NULL;
+	}
+
+	return cachep;
+}
+
+static inline void mod_objcg_state(struct obj_cgroup *objcg,
+				   struct pglist_data *pgdat,
+				   int idx, int nr)
+{
+	struct mem_cgroup *memcg;
+	struct lruvec *lruvec;
+
+	rcu_read_lock();
+	memcg = obj_cgroup_memcg(objcg);
+	lruvec = mem_cgroup_lruvec(memcg, pgdat);
+	mod_memcg_lruvec_state(lruvec, idx, nr);
+	rcu_read_unlock();
+}
+
 static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
 					      struct obj_cgroup *objcg,
 					      size_t size, void **p)
@@ -451,6 +426,10 @@  static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
 			off = obj_to_index(s, page, p[i]);
 			obj_cgroup_get(objcg);
 			page_obj_cgroups(page)[off] = objcg;
+			mod_objcg_state(objcg, page_pgdat(page),
+					cache_vmstat_idx(s), obj_full_size(s));
+		} else {
+			obj_cgroup_uncharge(objcg, obj_full_size(s));
 		}
 	}
 	obj_cgroup_put(objcg);
@@ -469,6 +448,11 @@  static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page,
 	off = obj_to_index(s, page, p);
 	objcg = page_obj_cgroups(page)[off];
 	page_obj_cgroups(page)[off] = NULL;
+
+	obj_cgroup_uncharge(objcg, obj_full_size(s));
+	mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s),
+			-obj_full_size(s));
+
 	obj_cgroup_put(objcg);
 }
 
@@ -510,17 +494,6 @@  static inline struct mem_cgroup *memcg_from_slab_page(struct page *page)
 	return NULL;
 }
 
-static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order,
-				    struct kmem_cache *s)
-{
-	return 0;
-}
-
-static inline void memcg_uncharge_slab(struct page *page, int order,
-				       struct kmem_cache *s)
-{
-}
-
 static inline int memcg_alloc_page_obj_cgroups(struct page *page, gfp_t gfp,
 					       unsigned int objects)
 {
@@ -531,6 +504,13 @@  static inline void memcg_free_page_obj_cgroups(struct page *page)
 {
 }
 
+static inline struct kmem_cache *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
+						struct obj_cgroup **objcgp,
+						size_t objects, gfp_t flags)
+{
+	return NULL;
+}
+
 static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
 					      struct obj_cgroup *objcg,
 					      size_t size, void **p)
@@ -568,32 +548,33 @@  static __always_inline int charge_slab_page(struct page *page,
 					    gfp_t gfp, int order,
 					    struct kmem_cache *s)
 {
-	int ret;
-
-	if (is_root_cache(s)) {
-		mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
-				    PAGE_SIZE << order);
-		return 0;
-	}
+#ifdef CONFIG_MEMCG_KMEM
+	if (!is_root_cache(s)) {
+		int ret;
 
-	ret = memcg_alloc_page_obj_cgroups(page, gfp, objs_per_slab(s));
-	if (ret)
-		return ret;
+		ret = memcg_alloc_page_obj_cgroups(page, gfp, objs_per_slab(s));
+		if (ret)
+			return ret;
 
-	return memcg_charge_slab(page, gfp, order, s);
+		percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
+	}
+#endif
+	mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+			    PAGE_SIZE << order);
+	return 0;
 }
 
 static __always_inline void uncharge_slab_page(struct page *page, int order,
 					       struct kmem_cache *s)
 {
-	if (is_root_cache(s)) {
-		mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
-				    -(PAGE_SIZE << order));
-		return;
+#ifdef CONFIG_MEMCG_KMEM
+	if (!is_root_cache(s)) {
+		memcg_free_page_obj_cgroups(page);
+		percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order);
 	}
-
-	memcg_free_page_obj_cgroups(page);
-	memcg_uncharge_slab(page, order, s);
+#endif
+	mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+			    -(PAGE_SIZE << order));
 }
 
 static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
@@ -665,7 +646,7 @@  static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
 
 	if (memcg_kmem_enabled() &&
 	    ((flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT)))
-		return memcg_kmem_get_cache(s, objcgp);
+		return memcg_slab_pre_alloc_hook(s, objcgp, size, flags);
 
 	return s;
 }