
[v3,3/4] mm: memcontrol: use obj_cgroup APIs to charge kmem pages

Message ID 20210309100717.253-4-songmuchun@bytedance.com
State New, archived
Series Use obj_cgroup APIs to charge kmem pages

Commit Message

Muchun Song March 9, 2021, 10:07 a.m. UTC
Since the series "The new cgroup slab memory controller" from Roman
was applied, all slab objects have been charged via the new obj_cgroup
APIs. The new APIs introduce a struct obj_cgroup to charge slab
objects, which prevents long-lived objects from pinning the original
memory cgroup in memory. But there are still some corner-case objects
(e.g. allocations larger than order-1 page on SLUB) which are not
charged via the new APIs. Those objects (including pages allocated
directly from the buddy allocator) are charged as kmem pages, which
still hold a reference to the memory cgroup.

This patch aims to charge kmem pages via the new obj_cgroup APIs as
well. After this change, the page->memcg_data of a kmem page points to
an object cgroup. We can use page_objcg() to get the object cgroup
associated with a kmem page, or page_memcg_check() to get the memory
cgroup associated with it, but then the caller must ensure that the
returned memcg won't be released (e.g. acquire the rcu_read_lock or
css_set_lock).

Signed-off-by: Muchun Song <songmuchun@bytedance.com>
---
 include/linux/memcontrol.h |  63 ++++++++++++++++++------
 mm/memcontrol.c            | 119 ++++++++++++++++++++++++++++++---------------
 2 files changed, 128 insertions(+), 54 deletions(-)
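
For illustration, the lookup pattern the commit message describes, as a
minimal sketch (stat_kmem_page() is a hypothetical caller, not part of
the patch; page_objcg() and obj_cgroup_memcg() are the helpers added
below):

	/* Hypothetical caller of the new kmem-page helpers. */
	static void stat_kmem_page(struct page *page)
	{
		struct mem_cgroup *memcg;

		if (PageMemcgKmem(page)) {
			/*
			 * The objcg can be reparented at any time, so the
			 * RCU section keeps the returned memcg alive.
			 */
			rcu_read_lock();
			memcg = obj_cgroup_memcg(page_objcg(page));
			/* ... read-only use of memcg ... */
			rcu_read_unlock();
		}
	}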

Comments

Roman Gushchin March 10, 2021, 7:53 p.m. UTC | #1
On Tue, Mar 09, 2021 at 06:07:16PM +0800, Muchun Song wrote:
> Since Roman series "The new cgroup slab memory controller" applied. All
> slab objects are charged via the new APIs of obj_cgroup. The new APIs
> introduce a struct obj_cgroup to charge slab objects. It prevents
> long-living objects from pinning the original memory cgroup in the memory.
> But there are still some corner objects (e.g. allocations larger than
> order-1 page on SLUB) which are not charged via the new APIs. Those
> objects (include the pages which are allocated from buddy allocator
> directly) are charged as kmem pages which still hold a reference to
> the memory cgroup.
> 
> This patch aims to charge the kmem pages by using the new APIs of
> obj_cgroup. Finally, the page->memcg_data of the kmem page points to
> an object cgroup. We can use the page_objcg() to get the object
> cgroup associated with a kmem page. Or we can use page_memcg_check()
> to get the memory cgroup associated with a kmem page, but caller must
> ensure that the returned memcg won't be released (e.g. acquire the
> rcu_read_lock or css_set_lock).
> 
> Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> ---
>  include/linux/memcontrol.h |  63 ++++++++++++++++++------
>  mm/memcontrol.c            | 119 ++++++++++++++++++++++++++++++---------------
>  2 files changed, 128 insertions(+), 54 deletions(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 83cbcdcfcc92..07c449af9c0f 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -370,6 +370,18 @@ static inline bool page_memcg_charged(struct page *page)
>  }
>  
>  /*
> + * After the initialization objcg->memcg is always pointing at
> + * a valid memcg, but can be atomically swapped to the parent memcg.
> + *
> + * The caller must ensure that the returned memcg won't be released:
> + * e.g. acquire the rcu_read_lock or css_set_lock.
> + */
> +static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
> +{
> +	return READ_ONCE(objcg->memcg);
> +}
> +
> +/*
>   * page_memcg - get the memory cgroup associated with a non-kmem page
>   * @page: a pointer to the page struct
>   *
> @@ -422,15 +434,19 @@ static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
>   * @page: a pointer to the page struct
>   *
>   * Returns a pointer to the memory cgroup associated with the page,
> - * or NULL. This function unlike page_memcg() can take any  page
> + * or NULL. This function unlike page_memcg() can take any page
>   * as an argument. It has to be used in cases when it's not known if a page
> - * has an associated memory cgroup pointer or an object cgroups vector.
> + * has an associated memory cgroup pointer or an object cgroups vector or
> + * an object cgroup.
>   *
>   * Any of the following ensures page and memcg binding stability:
>   * - the page lock
>   * - LRU isolation
>   * - lock_page_memcg()
>   * - exclusive reference
> + *
> + * Should be called under rcu lock which can protect memcg associated with a
> + * kmem page from being released.

How about this:

For a non-kmem page any of the following ensures page and memcg binding stability:
- the page lock
- LRU isolation
- lock_page_memcg()
- exclusive reference

For a kmem page a caller should hold an rcu read lock to protect memcg associated
with a kmem page from being released.
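
In caller form, the kmem rule amounts to (hypothetical snippet, only to
illustrate the wording above):

	rcu_read_lock();
	memcg = page_memcg_check(page);	/* may resolve via an objcg */
	if (memcg) {
		/* safe: the RCU section prevents the memcg's release */
	}
	rcu_read_unlock();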

>   */
>  static inline struct mem_cgroup *page_memcg_check(struct page *page)
>  {
> @@ -443,6 +459,13 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
>  	if (memcg_data & MEMCG_DATA_OBJCGS)
>  		return NULL;
>  
> +	if (memcg_data & MEMCG_DATA_KMEM) {
> +		struct obj_cgroup *objcg;
> +
> +		objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
> +		return obj_cgroup_memcg(objcg);
> +	}
> +
>  	return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
>  }
>  
> @@ -501,6 +524,25 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
>  	return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
>  }
>  
> +/*
> + * page_objcg - get the object cgroup associated with a kmem page
> + * @page: a pointer to the page struct
> + *
> + * Returns a pointer to the object cgroup associated with the kmem page,
> + * or NULL. This function assumes that the page is known to have an
> + * associated object cgroup. It's only safe to call this function
> + * against kmem pages (PageMemcgKmem() returns true).
> + */
> +static inline struct obj_cgroup *page_objcg(struct page *page)
> +{
> +	unsigned long memcg_data = page->memcg_data;
> +
> +	VM_BUG_ON_PAGE(PageSlab(page), page);
> +	VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
> +	VM_BUG_ON_PAGE(!(memcg_data & MEMCG_DATA_KMEM), page);
> +
> +	return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
> +}
>  #else
>  static inline struct obj_cgroup **page_objcgs(struct page *page)
>  {
> @@ -511,6 +553,11 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
>  {
>  	return NULL;
>  }
> +
> +static inline struct obj_cgroup *page_objcg(struct page *page)
> +{
> +	return NULL;
> +}
>  #endif
>  
>  static __always_inline bool memcg_stat_item_in_bytes(int idx)
> @@ -729,18 +776,6 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg)
>  	percpu_ref_put(&objcg->refcnt);
>  }
>  
> -/*
> - * After the initialization objcg->memcg is always pointing at
> - * a valid memcg, but can be atomically swapped to the parent memcg.
> - *
> - * The caller must ensure that the returned memcg won't be released:
> - * e.g. acquire the rcu_read_lock or css_set_lock.
> - */
> -static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
> -{
> -	return READ_ONCE(objcg->memcg);
> -}
> -
>  static inline void mem_cgroup_put(struct mem_cgroup *memcg)
>  {
>  	if (memcg)
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e1dc73ceb98a..38376f9d6659 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -859,15 +859,26 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
>  	pg_data_t *pgdat = page_pgdat(page);
>  	struct lruvec *lruvec;
>  
> -	memcg = page_memcg_check(head);
> -	/* Untracked pages have no memcg, no lruvec. Update only the node */
> -	if (!memcg) {
> -		__mod_node_page_state(pgdat, idx, val);
> -		return;
> +	if (PageMemcgKmem(head)) {
> +		rcu_read_lock();
> +		memcg = obj_cgroup_memcg(page_objcg(page));
> +	} else {
> +		memcg = page_memcg(head);
> +		/*
> +		 * Untracked pages have no memcg, no lruvec. Update only the
> +		 * node.
> +		 */
> +		if (!memcg) {
> +			__mod_node_page_state(pgdat, idx, val);
> +			return;
> +		}
>  	}
>  
>  	lruvec = mem_cgroup_lruvec(memcg, pgdat);
>  	__mod_lruvec_state(lruvec, idx, val);
> +
> +	if (PageMemcgKmem(head))
> +		rcu_read_unlock();
>  }
>  EXPORT_SYMBOL(__mod_lruvec_page_state);
>  
> @@ -2906,6 +2917,20 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg)
>  	page->memcg_data = (unsigned long)memcg;
>  }
>  
> +static inline struct mem_cgroup *obj_cgroup_memcg_get(struct obj_cgroup *objcg)

I'd prefer get_obj_cgroup_memcg(), if you don't mind.

> +{
> +	struct mem_cgroup *memcg;
> +
> +	rcu_read_lock();
> +retry:
> +	memcg = obj_cgroup_memcg(objcg);
> +	if (unlikely(!css_tryget(&memcg->css)))
> +		goto retry;
> +	rcu_read_unlock();
> +
> +	return memcg;
> +}
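
With the suggested name, and the reason for the retry spelled out, the
helper would read (a sketch of the rename, not the posted code):

	/*
	 * objcg->memcg can be swapped to the parent memcg concurrently by
	 * reparenting. If css_tryget() fails on a dying memcg, re-reading
	 * objcg->memcg under the same RCU section observes the new parent,
	 * on which the tryget can succeed.
	 */
	static inline struct mem_cgroup *get_obj_cgroup_memcg(struct obj_cgroup *objcg)
	{
		struct mem_cgroup *memcg;

		rcu_read_lock();
	retry:
		memcg = obj_cgroup_memcg(objcg);
		if (unlikely(!css_tryget(&memcg->css)))
			goto retry;
		rcu_read_unlock();

		return memcg;
	}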
> +
>  #ifdef CONFIG_MEMCG_KMEM
>  int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
>  				 gfp_t gfp, bool new_page)
> @@ -3071,15 +3096,8 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
>  	struct mem_cgroup *memcg;
>  	int ret;
>  
> -	rcu_read_lock();
> -retry:
> -	memcg = obj_cgroup_memcg(objcg);
> -	if (unlikely(!css_tryget(&memcg->css)))
> -		goto retry;
> -	rcu_read_unlock();
> -
> +	memcg = obj_cgroup_memcg_get(objcg);
>  	ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
> -
>  	css_put(&memcg->css);
>  
>  	return ret;
> @@ -3144,18 +3162,18 @@ static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_page
>   */
>  int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
>  {
> -	struct mem_cgroup *memcg;
> +	struct obj_cgroup *objcg;
>  	int ret = 0;
>  
> -	memcg = get_mem_cgroup_from_current();
> -	if (memcg && !mem_cgroup_is_root(memcg)) {
> -		ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
> +	objcg = get_obj_cgroup_from_current();
> +	if (objcg) {
> +		ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
>  		if (!ret) {
> -			page->memcg_data = (unsigned long)memcg |
> +			page->memcg_data = (unsigned long)objcg |
>  				MEMCG_DATA_KMEM;
>  			return 0;
>  		}
> -		css_put(&memcg->css);
> +		obj_cgroup_put(objcg);
>  	}
>  	return ret;
>  }
> @@ -3167,17 +3185,16 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
>   */
>  void __memcg_kmem_uncharge_page(struct page *page, int order)
>  {
> -	struct mem_cgroup *memcg;
> +	struct obj_cgroup *objcg;
>  	unsigned int nr_pages = 1 << order;
>  
>  	if (!page_memcg_charged(page))
>  		return;
>  
> -	memcg = page_memcg_check(page);
> -	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
> -	__memcg_kmem_uncharge(memcg, nr_pages);
> +	objcg = page_objcg(page);
> +	obj_cgroup_uncharge_pages(objcg, nr_pages);
>  	page->memcg_data = 0;
> -	css_put(&memcg->css);
> +	obj_cgroup_put(objcg);
>  }
>  
>  static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
> @@ -6806,11 +6823,23 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug)
>  static void uncharge_batch(const struct uncharge_gather *ug)
>  {
>  	unsigned long flags;
> +	unsigned long nr_pages;
>  
> -	if (!mem_cgroup_is_root(ug->memcg)) {
> -		page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
> +	/*
> +	 * The kmem pages can be reparented to the root memcg, in
> +	 * order to prevent the memory counter of root memcg from
> +	 * increasing indefinitely. We should decrease the memory
> +	 * counter when unchange.

I guess the correct syntax is
"The kmem pages can be reparented to the root memcg. In
order to prevent the memory counter of root memcg from
increasing indefinitely, we should decrease the memory
counter when unchange."

> +	 */
> +	if (mem_cgroup_is_root(ug->memcg))
> +		nr_pages = ug->nr_kmem;
> +	else
> +		nr_pages = ug->nr_pages;
> +
> +	if (nr_pages) {
> +		page_counter_uncharge(&ug->memcg->memory, nr_pages);
>  		if (do_memsw_account())
> -			page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
> +			page_counter_uncharge(&ug->memcg->memsw, nr_pages);
>  		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
>  			page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
>  		memcg_oom_recover(ug->memcg);
> @@ -6828,7 +6857,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
>  
>  static void uncharge_page(struct page *page, struct uncharge_gather *ug)
>  {
> -	unsigned long nr_pages;
> +	unsigned long nr_pages, nr_kmem;
>  	struct mem_cgroup *memcg;
>  
>  	VM_BUG_ON_PAGE(PageLRU(page), page);
> @@ -6836,34 +6865,44 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
>  	if (!page_memcg_charged(page))
>  		return;
>  
> +	nr_pages = compound_nr(page);
>  	/*
>  	 * Nobody should be changing or seriously looking at
> -	 * page memcg at this point, we have fully exclusive
> -	 * access to the page.
> +	 * page memcg or objcg at this point, we have fully
> +	 * exclusive access to the page.
>  	 */
> -	memcg = page_memcg_check(page);
> +	if (PageMemcgKmem(page)) {
> +		struct obj_cgroup *objcg;
> +
> +		objcg = page_objcg(page);
> +		memcg = obj_cgroup_memcg_get(objcg);
> +
> +		page->memcg_data = 0;
> +		obj_cgroup_put(objcg);
> +		nr_kmem = nr_pages;
> +	} else {
> +		memcg = page_memcg(page);
> +		page->memcg_data = 0;
> +		nr_kmem = 0;
> +	}
> +
>  	if (ug->memcg != memcg) {
>  		if (ug->memcg) {
>  			uncharge_batch(ug);
>  			uncharge_gather_clear(ug);
>  		}
>  		ug->memcg = memcg;
> +		ug->dummy_page = page;
>  
>  		/* pairs with css_put in uncharge_batch */
>  		css_get(&ug->memcg->css);
>  	}
>  
> -	nr_pages = compound_nr(page);
>  	ug->nr_pages += nr_pages;
> +	ug->nr_kmem += nr_kmem;
> +	ug->pgpgout += !nr_kmem;
>  
> -	if (PageMemcgKmem(page))
> -		ug->nr_kmem += nr_pages;
> -	else
> -		ug->pgpgout++;
> -
> -	ug->dummy_page = page;
> -	page->memcg_data = 0;
> -	css_put(&ug->memcg->css);
> +	css_put(&memcg->css);
>  }
>  
>  /**
> -- 
> 2.11.0
>
Johannes Weiner March 10, 2021, 10:05 p.m. UTC | #2
Hello Muchun,

On Tue, Mar 09, 2021 at 06:07:16PM +0800, Muchun Song wrote:
> @@ -6806,11 +6823,23 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug)
>  static void uncharge_batch(const struct uncharge_gather *ug)
>  {
>  	unsigned long flags;
> +	unsigned long nr_pages;
>  
> -	if (!mem_cgroup_is_root(ug->memcg)) {
> -		page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
> +	/*
> +	 * The kmem pages can be reparented to the root memcg, in
> +	 * order to prevent the memory counter of root memcg from
> +	 * increasing indefinitely. We should decrease the memory
> +	 * counter when unchange.
> +	 */
> +	if (mem_cgroup_is_root(ug->memcg))
> +		nr_pages = ug->nr_kmem;
> +	else
> +		nr_pages = ug->nr_pages;

Correct or not, I find this unreadable. We're uncharging nr_kmem on
the root, and nr_pages against leaf groups?

It implies several things that might not be immediately obvious to the
reader of this function. Namely, that nr_kmem is a subset of nr_pages.
Or that we don't *want* to account LRU pages for the root cgroup.

The old code followed a very simple pattern: the root memcg's page
counters aren't touched.

This is no longer true: we modify them depending on very specific
circumstances. But that's too clever for the stupid uncharge_batch()
which is only supposed to flush a number of accumulators into their
corresponding page counters.

This distinction really needs to be moved down to uncharge_page() now.

> @@ -6828,7 +6857,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
>  
>  static void uncharge_page(struct page *page, struct uncharge_gather *ug)
>  {
> -	unsigned long nr_pages;
> +	unsigned long nr_pages, nr_kmem;
>  	struct mem_cgroup *memcg;
>  
>  	VM_BUG_ON_PAGE(PageLRU(page), page);
> @@ -6836,34 +6865,44 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
>  	if (!page_memcg_charged(page))
>  		return;
>  
> +	nr_pages = compound_nr(page);
>  	/*
>  	 * Nobody should be changing or seriously looking at
> -	 * page memcg at this point, we have fully exclusive
> -	 * access to the page.
> +	 * page memcg or objcg at this point, we have fully
> +	 * exclusive access to the page.
>  	 */
> -	memcg = page_memcg_check(page);
> +	if (PageMemcgKmem(page)) {
> +		struct obj_cgroup *objcg;
> +
> +		objcg = page_objcg(page);
> +		memcg = obj_cgroup_memcg_get(objcg);
> +
> +		page->memcg_data = 0;
> +		obj_cgroup_put(objcg);
> +		nr_kmem = nr_pages;
> +	} else {
> +		memcg = page_memcg(page);
> +		page->memcg_data = 0;
> +		nr_kmem = 0;
> +	}

Why is all this moved above the uncharge_batch() call?

It separates the pointer manipulations from the refcounting, which
makes the code very difficult to follow.

> +
>  	if (ug->memcg != memcg) {
>  		if (ug->memcg) {
>  			uncharge_batch(ug);
>  			uncharge_gather_clear(ug);
>  		}
>  		ug->memcg = memcg;
> +		ug->dummy_page = page;

Why this change?

>  		/* pairs with css_put in uncharge_batch */
>  		css_get(&ug->memcg->css);
>  	}
>  
> -	nr_pages = compound_nr(page);
>  	ug->nr_pages += nr_pages;
> +	ug->nr_kmem += nr_kmem;
> +	ug->pgpgout += !nr_kmem;

Oof.

Yes, this pgpgout line is an equivalent transformation for counting
LRU compound pages. But unless you already know that, it's completely
impossible to understand what the intent here is.

Please avoid clever tricks like this. If you need to check whether the
page is kmem, test PageMemcgKmem() instead of abusing the counters as
boolean flags. This is supposed to be read by human beings, too.

> -	if (PageMemcgKmem(page))
> -		ug->nr_kmem += nr_pages;
> -	else
> -		ug->pgpgout++;
> -
> -	ug->dummy_page = page;
> -	page->memcg_data = 0;
> -	css_put(&ug->memcg->css);
> +	css_put(&memcg->css);

Sorry, these two functions are no longer readable after your changes.

Please retain the following sequence as discrete steps:

1. look up memcg from the page
2. flush existing batch if memcg changed
3. add page's various counts to the current batch
4. clear page->memcg and decrease the referece count to whatever it was pointing to

And as per above, step 3. is where we should check whether to uncharge
the memcg's page counter at all:

	if (PageMemcgKmem(page)) {
		ug->nr_pages += nr_pages;
		ug->nr_kmem += nr_pages;
	} else {
		/* LRU pages aren't accounted at the root level */
		if (!mem_cgroup_is_root(memcg))
			ug->nr_pages += nr_pages;
		ug->pgpgout++;
	}

In fact, it might be a good idea to rename ug->nr_pages to
ug->nr_memory to highlight how it maps to the page_counter.
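
Taken together, a sketch of uncharge_page() with the four discrete steps
and both suggested renames applied (get_obj_cgroup_memcg() and
ug->nr_memory); this only illustrates the feedback above and is not the
final patch:

	static void uncharge_page(struct page *page, struct uncharge_gather *ug)
	{
		unsigned long nr_pages = compound_nr(page);
		struct obj_cgroup *objcg = NULL;
		struct mem_cgroup *memcg;

		VM_BUG_ON_PAGE(PageLRU(page), page);

		if (!page_memcg_charged(page))
			return;

		/* 1. look up memcg from the page */
		if (PageMemcgKmem(page)) {
			objcg = page_objcg(page);
			memcg = get_obj_cgroup_memcg(objcg);	/* takes a css ref */
		} else {
			memcg = page_memcg(page);
		}

		/* 2. flush the existing batch if the memcg changed */
		if (ug->memcg != memcg) {
			if (ug->memcg) {
				uncharge_batch(ug);
				uncharge_gather_clear(ug);
			}
			ug->memcg = memcg;
			ug->dummy_page = page;
			/* pairs with css_put in uncharge_batch */
			css_get(&ug->memcg->css);
		}

		/* 3. add the page's various counts to the current batch */
		if (PageMemcgKmem(page)) {
			ug->nr_memory += nr_pages;
			ug->nr_kmem += nr_pages;
		} else {
			/* LRU pages aren't accounted at the root level */
			if (!mem_cgroup_is_root(memcg))
				ug->nr_memory += nr_pages;
			ug->pgpgout++;
		}

		/* 4. clear page->memcg and drop the reference(s) it held */
		page->memcg_data = 0;
		if (objcg)
			obj_cgroup_put(objcg);
		css_put(&memcg->css);
	}

With the root-level check done in step 3, uncharge_batch() no longer
needs the root-memcg special case and can simply flush ug->nr_memory
into the page counters.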
Muchun Song March 11, 2021, 6:50 a.m. UTC | #3
On Thu, Mar 11, 2021 at 3:53 AM Roman Gushchin <guro@fb.com> wrote:
>
> On Tue, Mar 09, 2021 at 06:07:16PM +0800, Muchun Song wrote:
> > Since Roman series "The new cgroup slab memory controller" applied. All
> > slab objects are charged via the new APIs of obj_cgroup. The new APIs
> > introduce a struct obj_cgroup to charge slab objects. It prevents
> > long-living objects from pinning the original memory cgroup in the memory.
> > But there are still some corner objects (e.g. allocations larger than
> > order-1 page on SLUB) which are not charged via the new APIs. Those
> > objects (include the pages which are allocated from buddy allocator
> > directly) are charged as kmem pages which still hold a reference to
> > the memory cgroup.
> >
> > This patch aims to charge the kmem pages by using the new APIs of
> > obj_cgroup. Finally, the page->memcg_data of the kmem page points to
> > an object cgroup. We can use the page_objcg() to get the object
> > cgroup associated with a kmem page. Or we can use page_memcg_check()
> > to get the memory cgroup associated with a kmem page, but caller must
> > ensure that the returned memcg won't be released (e.g. acquire the
> > rcu_read_lock or css_set_lock).
> >
> > Signed-off-by: Muchun Song <songmuchun@bytedance.com>
> > ---
> >  include/linux/memcontrol.h |  63 ++++++++++++++++++------
> >  mm/memcontrol.c            | 119 ++++++++++++++++++++++++++++++---------------
> >  2 files changed, 128 insertions(+), 54 deletions(-)
> >
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index 83cbcdcfcc92..07c449af9c0f 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -370,6 +370,18 @@ static inline bool page_memcg_charged(struct page *page)
> >  }
> >
> >  /*
> > + * After the initialization objcg->memcg is always pointing at
> > + * a valid memcg, but can be atomically swapped to the parent memcg.
> > + *
> > + * The caller must ensure that the returned memcg won't be released:
> > + * e.g. acquire the rcu_read_lock or css_set_lock.
> > + */
> > +static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
> > +{
> > +     return READ_ONCE(objcg->memcg);
> > +}
> > +
> > +/*
> >   * page_memcg - get the memory cgroup associated with a non-kmem page
> >   * @page: a pointer to the page struct
> >   *
> > @@ -422,15 +434,19 @@ static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
> >   * @page: a pointer to the page struct
> >   *
> >   * Returns a pointer to the memory cgroup associated with the page,
> > - * or NULL. This function unlike page_memcg() can take any  page
> > + * or NULL. This function unlike page_memcg() can take any page
> >   * as an argument. It has to be used in cases when it's not known if a page
> > - * has an associated memory cgroup pointer or an object cgroups vector.
> > + * has an associated memory cgroup pointer or an object cgroups vector or
> > + * an object cgroup.
> >   *
> >   * Any of the following ensures page and memcg binding stability:
> >   * - the page lock
> >   * - LRU isolation
> >   * - lock_page_memcg()
> >   * - exclusive reference
> > + *
> > + * Should be called under rcu lock which can protect memcg associated with a
> > + * kmem page from being released.
>
> How about this:
>
> For a non-kmem page any of the following ensures page and memcg binding stability:
> - the page lock
> - LRU isolation
> - lock_page_memcg()
> - exclusive reference
>
> For a kmem page a caller should hold an rcu read lock to protect memcg associated
> with a kmem page from being released.

OK. I will use this. Thanks Roman.

>
> >   */
> >  static inline struct mem_cgroup *page_memcg_check(struct page *page)
> >  {
> > @@ -443,6 +459,13 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
> >       if (memcg_data & MEMCG_DATA_OBJCGS)
> >               return NULL;
> >
> > +     if (memcg_data & MEMCG_DATA_KMEM) {
> > +             struct obj_cgroup *objcg;
> > +
> > +             objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
> > +             return obj_cgroup_memcg(objcg);
> > +     }
> > +
> >       return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
> >  }
> >
> > @@ -501,6 +524,25 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
> >       return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
> >  }
> >
> > +/*
> > + * page_objcg - get the object cgroup associated with a kmem page
> > + * @page: a pointer to the page struct
> > + *
> > + * Returns a pointer to the object cgroup associated with the kmem page,
> > + * or NULL. This function assumes that the page is known to have an
> > + * associated object cgroup. It's only safe to call this function
> > + * against kmem pages (PageMemcgKmem() returns true).
> > + */
> > +static inline struct obj_cgroup *page_objcg(struct page *page)
> > +{
> > +     unsigned long memcg_data = page->memcg_data;
> > +
> > +     VM_BUG_ON_PAGE(PageSlab(page), page);
> > +     VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
> > +     VM_BUG_ON_PAGE(!(memcg_data & MEMCG_DATA_KMEM), page);
> > +
> > +     return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
> > +}
> >  #else
> >  static inline struct obj_cgroup **page_objcgs(struct page *page)
> >  {
> > @@ -511,6 +553,11 @@ static inline struct obj_cgroup **page_objcgs_check(struct page *page)
> >  {
> >       return NULL;
> >  }
> > +
> > +static inline struct obj_cgroup *page_objcg(struct page *page)
> > +{
> > +     return NULL;
> > +}
> >  #endif
> >
> >  static __always_inline bool memcg_stat_item_in_bytes(int idx)
> > @@ -729,18 +776,6 @@ static inline void obj_cgroup_put(struct obj_cgroup *objcg)
> >       percpu_ref_put(&objcg->refcnt);
> >  }
> >
> > -/*
> > - * After the initialization objcg->memcg is always pointing at
> > - * a valid memcg, but can be atomically swapped to the parent memcg.
> > - *
> > - * The caller must ensure that the returned memcg won't be released:
> > - * e.g. acquire the rcu_read_lock or css_set_lock.
> > - */
> > -static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
> > -{
> > -     return READ_ONCE(objcg->memcg);
> > -}
> > -
> >  static inline void mem_cgroup_put(struct mem_cgroup *memcg)
> >  {
> >       if (memcg)
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> > index e1dc73ceb98a..38376f9d6659 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -859,15 +859,26 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
> >       pg_data_t *pgdat = page_pgdat(page);
> >       struct lruvec *lruvec;
> >
> > -     memcg = page_memcg_check(head);
> > -     /* Untracked pages have no memcg, no lruvec. Update only the node */
> > -     if (!memcg) {
> > -             __mod_node_page_state(pgdat, idx, val);
> > -             return;
> > +     if (PageMemcgKmem(head)) {
> > +             rcu_read_lock();
> > +             memcg = obj_cgroup_memcg(page_objcg(page));
> > +     } else {
> > +             memcg = page_memcg(head);
> > +             /*
> > +              * Untracked pages have no memcg, no lruvec. Update only the
> > +              * node.
> > +              */
> > +             if (!memcg) {
> > +                     __mod_node_page_state(pgdat, idx, val);
> > +                     return;
> > +             }
> >       }
> >
> >       lruvec = mem_cgroup_lruvec(memcg, pgdat);
> >       __mod_lruvec_state(lruvec, idx, val);
> > +
> > +     if (PageMemcgKmem(head))
> > +             rcu_read_unlock();
> >  }
> >  EXPORT_SYMBOL(__mod_lruvec_page_state);
> >
> > @@ -2906,6 +2917,20 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg)
> >       page->memcg_data = (unsigned long)memcg;
> >  }
> >
> > +static inline struct mem_cgroup *obj_cgroup_memcg_get(struct obj_cgroup *objcg)
>
> I'd prefer get_obj_cgroup_memcg(), if you don't mind.

LGTM, will do.

>
> > +{
> > +     struct mem_cgroup *memcg;
> > +
> > +     rcu_read_lock();
> > +retry:
> > +     memcg = obj_cgroup_memcg(objcg);
> > +     if (unlikely(!css_tryget(&memcg->css)))
> > +             goto retry;
> > +     rcu_read_unlock();
> > +
> > +     return memcg;
> > +}
> > +
> >  #ifdef CONFIG_MEMCG_KMEM
> >  int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
> >                                gfp_t gfp, bool new_page)
> > @@ -3071,15 +3096,8 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
> >       struct mem_cgroup *memcg;
> >       int ret;
> >
> > -     rcu_read_lock();
> > -retry:
> > -     memcg = obj_cgroup_memcg(objcg);
> > -     if (unlikely(!css_tryget(&memcg->css)))
> > -             goto retry;
> > -     rcu_read_unlock();
> > -
> > +     memcg = obj_cgroup_memcg_get(objcg);
> >       ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
> > -
> >       css_put(&memcg->css);
> >
> >       return ret;
> > @@ -3144,18 +3162,18 @@ static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_page
> >   */
> >  int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
> >  {
> > -     struct mem_cgroup *memcg;
> > +     struct obj_cgroup *objcg;
> >       int ret = 0;
> >
> > -     memcg = get_mem_cgroup_from_current();
> > -     if (memcg && !mem_cgroup_is_root(memcg)) {
> > -             ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
> > +     objcg = get_obj_cgroup_from_current();
> > +     if (objcg) {
> > +             ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
> >               if (!ret) {
> > -                     page->memcg_data = (unsigned long)memcg |
> > +                     page->memcg_data = (unsigned long)objcg |
> >                               MEMCG_DATA_KMEM;
> >                       return 0;
> >               }
> > -             css_put(&memcg->css);
> > +             obj_cgroup_put(objcg);
> >       }
> >       return ret;
> >  }
> > @@ -3167,17 +3185,16 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
> >   */
> >  void __memcg_kmem_uncharge_page(struct page *page, int order)
> >  {
> > -     struct mem_cgroup *memcg;
> > +     struct obj_cgroup *objcg;
> >       unsigned int nr_pages = 1 << order;
> >
> >       if (!page_memcg_charged(page))
> >               return;
> >
> > -     memcg = page_memcg_check(page);
> > -     VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
> > -     __memcg_kmem_uncharge(memcg, nr_pages);
> > +     objcg = page_objcg(page);
> > +     obj_cgroup_uncharge_pages(objcg, nr_pages);
> >       page->memcg_data = 0;
> > -     css_put(&memcg->css);
> > +     obj_cgroup_put(objcg);
> >  }
> >
> >  static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
> > @@ -6806,11 +6823,23 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug)
> >  static void uncharge_batch(const struct uncharge_gather *ug)
> >  {
> >       unsigned long flags;
> > +     unsigned long nr_pages;
> >
> > -     if (!mem_cgroup_is_root(ug->memcg)) {
> > -             page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
> > +     /*
> > +      * The kmem pages can be reparented to the root memcg, in
> > +      * order to prevent the memory counter of root memcg from
> > +      * increasing indefinitely. We should decrease the memory
> > +      * counter when unchange.
>
> I guess the correct syntax is
> "The kmem pages can be reparented to the root memcg. In
> order to prevent the memory counter of root memcg from
> increasing indefinitely, we should decrease the memory
> counter when unchange."

Right. I will combine your and Johannes's suggestions about
how to rework the code here.

>
> > +      */
> > +     if (mem_cgroup_is_root(ug->memcg))
> > +             nr_pages = ug->nr_kmem;
> > +     else
> > +             nr_pages = ug->nr_pages;
> > +
> > +     if (nr_pages) {
> > +             page_counter_uncharge(&ug->memcg->memory, nr_pages);
> >               if (do_memsw_account())
> > -                     page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
> > +                     page_counter_uncharge(&ug->memcg->memsw, nr_pages);
> >               if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
> >                       page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
> >               memcg_oom_recover(ug->memcg);
> > @@ -6828,7 +6857,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
> >
> >  static void uncharge_page(struct page *page, struct uncharge_gather *ug)
> >  {
> > -     unsigned long nr_pages;
> > +     unsigned long nr_pages, nr_kmem;
> >       struct mem_cgroup *memcg;
> >
> >       VM_BUG_ON_PAGE(PageLRU(page), page);
> > @@ -6836,34 +6865,44 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
> >       if (!page_memcg_charged(page))
> >               return;
> >
> > +     nr_pages = compound_nr(page);
> >       /*
> >        * Nobody should be changing or seriously looking at
> > -      * page memcg at this point, we have fully exclusive
> > -      * access to the page.
> > +      * page memcg or objcg at this point, we have fully
> > +      * exclusive access to the page.
> >        */
> > -     memcg = page_memcg_check(page);
> > +     if (PageMemcgKmem(page)) {
> > +             struct obj_cgroup *objcg;
> > +
> > +             objcg = page_objcg(page);
> > +             memcg = obj_cgroup_memcg_get(objcg);
> > +
> > +             page->memcg_data = 0;
> > +             obj_cgroup_put(objcg);
> > +             nr_kmem = nr_pages;
> > +     } else {
> > +             memcg = page_memcg(page);
> > +             page->memcg_data = 0;
> > +             nr_kmem = 0;
> > +     }
> > +
> >       if (ug->memcg != memcg) {
> >               if (ug->memcg) {
> >                       uncharge_batch(ug);
> >                       uncharge_gather_clear(ug);
> >               }
> >               ug->memcg = memcg;
> > +             ug->dummy_page = page;
> >
> >               /* pairs with css_put in uncharge_batch */
> >               css_get(&ug->memcg->css);
> >       }
> >
> > -     nr_pages = compound_nr(page);
> >       ug->nr_pages += nr_pages;
> > +     ug->nr_kmem += nr_kmem;
> > +     ug->pgpgout += !nr_kmem;
> >
> > -     if (PageMemcgKmem(page))
> > -             ug->nr_kmem += nr_pages;
> > -     else
> > -             ug->pgpgout++;
> > -
> > -     ug->dummy_page = page;
> > -     page->memcg_data = 0;
> > -     css_put(&ug->memcg->css);
> > +     css_put(&memcg->css);
> >  }
> >
> >  /**
> > --
> > 2.11.0
> >
kernel test robot March 11, 2021, 5:48 p.m. UTC | #4
Hi Muchun,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[also build test WARNING on v5.12-rc2 next-20210311]
[cannot apply to hnaz-linux-mm/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/Muchun-Song/Use-obj_cgroup-APIs-to-charge-kmem-pages/20210309-181121
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 144c79ef33536b4ecb4951e07dbc1f2b7fa99d32
config: i386-randconfig-s002-20210311 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-22) 9.3.0
reproduce:
        # apt-get install sparse
        # sparse version: v0.6.3-262-g5e674421-dirty
        # https://github.com/0day-ci/linux/commit/202c922730a115143cf9e15ab26633f247e00229
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Muchun-Song/Use-obj_cgroup-APIs-to-charge-kmem-pages/20210309-181121
        git checkout 202c922730a115143cf9e15ab26633f247e00229
        # save the attached .config to linux build tree
        make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=i386 

If you fix the issue, kindly add the following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>


"sparse warnings: (new ones prefixed by >>)"
   mm/memcontrol.c:4194:21: sparse: sparse: incompatible types in comparison expression (different address spaces):
   mm/memcontrol.c:4194:21: sparse:    struct mem_cgroup_threshold_ary [noderef] __rcu *
   mm/memcontrol.c:4194:21: sparse:    struct mem_cgroup_threshold_ary *
   mm/memcontrol.c:4196:21: sparse: sparse: incompatible types in comparison expression (different address spaces):
   mm/memcontrol.c:4196:21: sparse:    struct mem_cgroup_threshold_ary [noderef] __rcu *
   mm/memcontrol.c:4196:21: sparse:    struct mem_cgroup_threshold_ary *
   mm/memcontrol.c:4352:9: sparse: sparse: incompatible types in comparison expression (different address spaces):
   mm/memcontrol.c:4352:9: sparse:    struct mem_cgroup_threshold_ary [noderef] __rcu *
   mm/memcontrol.c:4352:9: sparse:    struct mem_cgroup_threshold_ary *
   mm/memcontrol.c:4446:9: sparse: sparse: incompatible types in comparison expression (different address spaces):
   mm/memcontrol.c:4446:9: sparse:    struct mem_cgroup_threshold_ary [noderef] __rcu *
   mm/memcontrol.c:4446:9: sparse:    struct mem_cgroup_threshold_ary *
   mm/memcontrol.c:6002:23: sparse: sparse: incompatible types in comparison expression (different address spaces):
   mm/memcontrol.c:6002:23: sparse:    struct task_struct [noderef] __rcu *
   mm/memcontrol.c:6002:23: sparse:    struct task_struct *
>> mm/memcontrol.c:854:6: sparse: sparse: context imbalance in '__mod_lruvec_page_state' - different lock contexts for basic block
   mm/memcontrol.c: note: in included file:
   include/linux/memcontrol.h:707:9: sparse: sparse: context imbalance in 'lock_page_lruvec' - wrong count at exit
   include/linux/memcontrol.h:707:9: sparse: sparse: context imbalance in 'lock_page_lruvec_irq' - wrong count at exit
   include/linux/memcontrol.h:707:9: sparse: sparse: context imbalance in 'lock_page_lruvec_irqsave' - wrong count at exit
   mm/memcontrol.c:2137:19: sparse: sparse: context imbalance in 'lock_page_memcg' - wrong count at exit
   mm/memcontrol.c:2199:17: sparse: sparse: context imbalance in '__unlock_page_memcg' - unexpected unlock
   mm/memcontrol.c:5853:28: sparse: sparse: context imbalance in 'mem_cgroup_count_precharge_pte_range' - unexpected unlock
   mm/memcontrol.c:6047:36: sparse: sparse: context imbalance in 'mem_cgroup_move_charge_pte_range' - unexpected unlock

vim +/__mod_lruvec_page_state +854 mm/memcontrol.c

eedc4e5a142cc3 Roman Gushchin 2020-08-06  853  
c47d5032ed3002 Shakeel Butt   2020-12-14 @854  void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
c47d5032ed3002 Shakeel Butt   2020-12-14  855  			     int val)
c47d5032ed3002 Shakeel Butt   2020-12-14  856  {
c47d5032ed3002 Shakeel Butt   2020-12-14  857  	struct page *head = compound_head(page); /* rmap on tail pages */
286d6cff5d9973 Muchun Song    2021-03-09  858  	struct mem_cgroup *memcg;
c47d5032ed3002 Shakeel Butt   2020-12-14  859  	pg_data_t *pgdat = page_pgdat(page);
c47d5032ed3002 Shakeel Butt   2020-12-14  860  	struct lruvec *lruvec;
c47d5032ed3002 Shakeel Butt   2020-12-14  861  
202c922730a115 Muchun Song    2021-03-09  862  	if (PageMemcgKmem(head)) {
202c922730a115 Muchun Song    2021-03-09  863  		rcu_read_lock();
202c922730a115 Muchun Song    2021-03-09  864  		memcg = obj_cgroup_memcg(page_objcg(page));
202c922730a115 Muchun Song    2021-03-09  865  	} else {
202c922730a115 Muchun Song    2021-03-09  866  		memcg = page_memcg(head);
202c922730a115 Muchun Song    2021-03-09  867  		/*
202c922730a115 Muchun Song    2021-03-09  868  		 * Untracked pages have no memcg, no lruvec. Update only the
202c922730a115 Muchun Song    2021-03-09  869  		 * node.
202c922730a115 Muchun Song    2021-03-09  870  		 */
d635a69dd4981c Linus Torvalds 2020-12-15  871  		if (!memcg) {
c47d5032ed3002 Shakeel Butt   2020-12-14  872  			__mod_node_page_state(pgdat, idx, val);
c47d5032ed3002 Shakeel Butt   2020-12-14  873  			return;
c47d5032ed3002 Shakeel Butt   2020-12-14  874  		}
202c922730a115 Muchun Song    2021-03-09  875  	}
c47d5032ed3002 Shakeel Butt   2020-12-14  876  
d635a69dd4981c Linus Torvalds 2020-12-15  877  	lruvec = mem_cgroup_lruvec(memcg, pgdat);
c47d5032ed3002 Shakeel Butt   2020-12-14  878  	__mod_lruvec_state(lruvec, idx, val);
202c922730a115 Muchun Song    2021-03-09  879  
202c922730a115 Muchun Song    2021-03-09  880  	if (PageMemcgKmem(head))
202c922730a115 Muchun Song    2021-03-09  881  		rcu_read_unlock();
c47d5032ed3002 Shakeel Butt   2020-12-14  882  }
f0c0c115fb8194 Shakeel Butt   2020-12-14  883  EXPORT_SYMBOL(__mod_lruvec_page_state);
c47d5032ed3002 Shakeel Butt   2020-12-14  884  
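
The '>>' imbalance comes from taking rcu_read_lock() in one branch and
releasing it in a separate conditional at the end of the function, which
sparse sees as different lock contexts across basic blocks. One way to
keep the lock balanced within a single block (a sketch, not code posted
in this thread; it also uses 'head' consistently, since memcg_data lives
on the compound head):

	if (PageMemcgKmem(head)) {
		rcu_read_lock();
		memcg = obj_cgroup_memcg(page_objcg(head));
		lruvec = mem_cgroup_lruvec(memcg, pgdat);
		__mod_lruvec_state(lruvec, idx, val);
		rcu_read_unlock();
		return;
	}

	memcg = page_memcg(head);
	/* Untracked pages have no memcg, no lruvec. Update only the node. */
	if (!memcg) {
		__mod_node_page_state(pgdat, idx, val);
		return;
	}

	lruvec = mem_cgroup_lruvec(memcg, pgdat);
	__mod_lruvec_state(lruvec, idx, val);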

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Muchun Song March 12, 2021, 9:22 a.m. UTC | #5
On Thu, Mar 11, 2021 at 6:05 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> Hello Muchun,
>
> On Tue, Mar 09, 2021 at 06:07:16PM +0800, Muchun Song wrote:
> > @@ -6806,11 +6823,23 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug)
> >  static void uncharge_batch(const struct uncharge_gather *ug)
> >  {
> >       unsigned long flags;
> > +     unsigned long nr_pages;
> >
> > -     if (!mem_cgroup_is_root(ug->memcg)) {
> > -             page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
> > +     /*
> > +      * The kmem pages can be reparented to the root memcg, in
> > +      * order to prevent the memory counter of root memcg from
> > +      * increasing indefinitely. We should decrease the memory
> > +      * counter when unchange.
> > +      */
> > +     if (mem_cgroup_is_root(ug->memcg))
> > +             nr_pages = ug->nr_kmem;
> > +     else
> > +             nr_pages = ug->nr_pages;
>
> Correct or not, I find this unreadable. We're uncharging nr_kmem on
> the root, and nr_pages against leaf groups?
>
> It implies several things that might not be immediately obvious to the
> reader of this function. Namely, that nr_kmem is a subset of nr_pages.
> Or that we don't *want* to account LRU pages for the root cgroup.
>
> The old code followed a very simple pattern: the root memcg's page
> counters aren't touched.
>
> This is no longer true: we modify them depending on very specific
> circumstances. But that's too clever for the stupid uncharge_batch()
> which is only supposed to flush a number of accumulators into their
> corresponding page counters.
>
> This distinction really needs to be moved down to uncharge_page() now.

OK. I will rework the code here to make it readable.

>
> > @@ -6828,7 +6857,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
> >
> >  static void uncharge_page(struct page *page, struct uncharge_gather *ug)
> >  {
> > -     unsigned long nr_pages;
> > +     unsigned long nr_pages, nr_kmem;
> >       struct mem_cgroup *memcg;
> >
> >       VM_BUG_ON_PAGE(PageLRU(page), page);
> > @@ -6836,34 +6865,44 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
> >       if (!page_memcg_charged(page))
> >               return;
> >
> > +     nr_pages = compound_nr(page);
> >       /*
> >        * Nobody should be changing or seriously looking at
> > -      * page memcg at this point, we have fully exclusive
> > -      * access to the page.
> > +      * page memcg or objcg at this point, we have fully
> > +      * exclusive access to the page.
> >        */
> > -     memcg = page_memcg_check(page);
> > +     if (PageMemcgKmem(page)) {
> > +             struct obj_cgroup *objcg;
> > +
> > +             objcg = page_objcg(page);
> > +             memcg = obj_cgroup_memcg_get(objcg);
> > +
> > +             page->memcg_data = 0;
> > +             obj_cgroup_put(objcg);
> > +             nr_kmem = nr_pages;
> > +     } else {
> > +             memcg = page_memcg(page);
> > +             page->memcg_data = 0;
> > +             nr_kmem = 0;
> > +     }
>
> Why is all this moved above the uncharge_batch() call?

Before calling obj_cgroup_put(), we need to set page->memcg_data
to zero. So I moved "page->memcg_data = 0" up here.
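
That is, the required pairing is (sketch):

	objcg = page_objcg(page);
	memcg = obj_cgroup_memcg_get(objcg);

	page->memcg_data = 0;	/* the page no longer points at the objcg */
	obj_cgroup_put(objcg);	/* only now drop the reference it held */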

>
> It separates the pointer manipulations from the refcounting, which
> makes the code very difficult to follow.
>
> > +
> >       if (ug->memcg != memcg) {
> >               if (ug->memcg) {
> >                       uncharge_batch(ug);
> >                       uncharge_gather_clear(ug);
> >               }
> >               ug->memcg = memcg;
> > +             ug->dummy_page = page;
>
> Why this change?

Just like ug->memcg, we do not need to set
ug->dummy_page in every loop.


>
> >               /* pairs with css_put in uncharge_batch */
> >               css_get(&ug->memcg->css);
> >       }
> >
> > -     nr_pages = compound_nr(page);
> >       ug->nr_pages += nr_pages;
> > +     ug->nr_kmem += nr_kmem;
> > +     ug->pgpgout += !nr_kmem;
>
> Oof.
>
> Yes, this pgpgout line is an equivalent transformation for counting
> LRU compound pages. But unless you already know that, it's completely
> impossible to understand what the intent here is.
>
> Please avoid clever tricks like this. If you need to check whether the
> page is kmem, test PageMemcgKmem() instead of abusing the counters as
> boolean flags. This is supposed to be read by human beings, too.

Got it.

>
> > -     if (PageMemcgKmem(page))
> > -             ug->nr_kmem += nr_pages;
> > -     else
> > -             ug->pgpgout++;
> > -
> > -     ug->dummy_page = page;
> > -     page->memcg_data = 0;
> > -     css_put(&ug->memcg->css);
> > +     css_put(&memcg->css);
>
> Sorry, these two functions are no longer readable after your changes.
>
> Please retain the following sequence as discrete steps:
>
> 1. look up memcg from the page
> 2. flush existing batch if memcg changed
> 3. add page's various counts to the current batch
> 4. clear page->memcg and decrease the referece count to whatever it was pointing to

Got it.

>
> And as per above, step 3. is where we should check whether to uncharge
> the memcg's page counter at all:
>
>         if (PageMemcgKmem(page)) {
>                 ug->nr_pages += nr_pages;
>                 ug->nr_kmem += nr_pages;
>         } else {
>                 /* LRU pages aren't accounted at the root level */
>                 if (!mem_cgroup_is_root(memcg))
>                         ug->nr_pages += nr_pages;
>                 ug->pgpgout++;
>         }
>
> In fact, it might be a good idea to rename ug->nr_pages to
> ug->nr_memory to highlight how it maps to the page_counter.

I will rework the code in the next version. Thanks for your
suggestions.
Johannes Weiner March 12, 2021, 3:59 p.m. UTC | #6
On Fri, Mar 12, 2021 at 05:22:55PM +0800, Muchun Song wrote:
> On Thu, Mar 11, 2021 at 6:05 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > @@ -6828,7 +6857,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
> > >
> > >  static void uncharge_page(struct page *page, struct uncharge_gather *ug)
> > >  {
> > > -     unsigned long nr_pages;
> > > +     unsigned long nr_pages, nr_kmem;
> > >       struct mem_cgroup *memcg;
> > >
> > >       VM_BUG_ON_PAGE(PageLRU(page), page);
> > > @@ -6836,34 +6865,44 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
> > >       if (!page_memcg_charged(page))
> > >               return;
> > >
> > > +     nr_pages = compound_nr(page);
> > >       /*
> > >        * Nobody should be changing or seriously looking at
> > > -      * page memcg at this point, we have fully exclusive
> > > -      * access to the page.
> > > +      * page memcg or objcg at this point, we have fully
> > > +      * exclusive access to the page.
> > >        */
> > > -     memcg = page_memcg_check(page);
> > > +     if (PageMemcgKmem(page)) {
> > > +             struct obj_cgroup *objcg;
> > > +
> > > +             objcg = page_objcg(page);
> > > +             memcg = obj_cgroup_memcg_get(objcg);
> > > +
> > > +             page->memcg_data = 0;
> > > +             obj_cgroup_put(objcg);
> > > +             nr_kmem = nr_pages;
> > > +     } else {
> > > +             memcg = page_memcg(page);
> > > +             page->memcg_data = 0;
> > > +             nr_kmem = 0;
> > > +     }
> >
> > Why is all this moved above the uncharge_batch() call?
> 
> Before calling obj_cgroup_put(), we need set page->memcg_data
> to zero. So I move "page->memcg_data = 0" to here.

Yeah, it makes sense to keep those together, but we can move them both
down to after the uncharge, right?

> > It separates the pointer manipulations from the refcounting, which
> > makes the code very difficult to follow.
> >
> > > +
> > >       if (ug->memcg != memcg) {
> > >               if (ug->memcg) {
> > >                       uncharge_batch(ug);
> > >                       uncharge_gather_clear(ug);
> > >               }
> > >               ug->memcg = memcg;
> > > +             ug->dummy_page = page;
> >
> > Why this change?
> 
> Just like ug->memcg, we do not need to set
> ug->dummy_page in every loop.

Ah, okay. That's a reasonable change, it's just confusing because I
thought this was a requirement for the new code to work. But I didn't
see how it relied on that, and it made me think I'm not understanding
your code ;) It's better to split that into a separate patch.

> I will rework the code in the next version.

Thanks!
Muchun Song March 12, 2021, 4:46 p.m. UTC | #7
On Fri, Mar 12, 2021 at 11:59 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Fri, Mar 12, 2021 at 05:22:55PM +0800, Muchun Song wrote:
> > On Thu, Mar 11, 2021 at 6:05 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> > > > @@ -6828,7 +6857,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
> > > >
> > > >  static void uncharge_page(struct page *page, struct uncharge_gather *ug)
> > > >  {
> > > > -     unsigned long nr_pages;
> > > > +     unsigned long nr_pages, nr_kmem;
> > > >       struct mem_cgroup *memcg;
> > > >
> > > >       VM_BUG_ON_PAGE(PageLRU(page), page);
> > > > @@ -6836,34 +6865,44 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
> > > >       if (!page_memcg_charged(page))
> > > >               return;
> > > >
> > > > +     nr_pages = compound_nr(page);
> > > >       /*
> > > >        * Nobody should be changing or seriously looking at
> > > > -      * page memcg at this point, we have fully exclusive
> > > > -      * access to the page.
> > > > +      * page memcg or objcg at this point, we have fully
> > > > +      * exclusive access to the page.
> > > >        */
> > > > -     memcg = page_memcg_check(page);
> > > > +     if (PageMemcgKmem(page)) {
> > > > +             struct obj_cgroup *objcg;
> > > > +
> > > > +             objcg = page_objcg(page);
> > > > +             memcg = obj_cgroup_memcg_get(objcg);
> > > > +
> > > > +             page->memcg_data = 0;
> > > > +             obj_cgroup_put(objcg);
> > > > +             nr_kmem = nr_pages;
> > > > +     } else {
> > > > +             memcg = page_memcg(page);
> > > > +             page->memcg_data = 0;
> > > > +             nr_kmem = 0;
> > > > +     }
> > >
> > > Why is all this moved above the uncharge_batch() call?
> >
> > Before calling obj_cgroup_put(), we need set page->memcg_data
> > to zero. So I move "page->memcg_data = 0" to here.
>
> Yeah, it makes sense to keep those together, but we can move them both
> down to after the uncharge, right?

Right. I am doing this.

>
> > > It separates the pointer manipulations from the refcounting, which
> > > makes the code very difficult to follow.
> > >
> > > > +
> > > >       if (ug->memcg != memcg) {
> > > >               if (ug->memcg) {
> > > >                       uncharge_batch(ug);
> > > >                       uncharge_gather_clear(ug);
> > > >               }
> > > >               ug->memcg = memcg;
> > > > +             ug->dummy_page = page;
> > >
> > > Why this change?
> >
> > Just like ug->memcg, we do not need to set
> > ug->dummy_page in every loop.
>
> Ah, okay. That's a reasonable change, it's just confusing because I
> thought this was a requirement for the new code to work. But I didn't
> see how it relied on that, and it made me think I'm not understanding
> your code ;) It's better to split that into a separate patch.

Sorry for confusing you. I will split that into a separate patch.
Thanks.

>
> > I will rework the code in the next version.
>
> Thanks!

Patch

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 83cbcdcfcc92..07c449af9c0f 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -370,6 +370,18 @@  static inline bool page_memcg_charged(struct page *page)
 }
 
 /*
+ * After the initialization objcg->memcg is always pointing at
+ * a valid memcg, but can be atomically swapped to the parent memcg.
+ *
+ * The caller must ensure that the returned memcg won't be released:
+ * e.g. acquire the rcu_read_lock or css_set_lock.
+ */
+static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
+{
+	return READ_ONCE(objcg->memcg);
+}
+
+/*
  * page_memcg - get the memory cgroup associated with a non-kmem page
  * @page: a pointer to the page struct
  *
@@ -422,15 +434,19 @@  static inline struct mem_cgroup *page_memcg_rcu(struct page *page)
  * @page: a pointer to the page struct
  *
  * Returns a pointer to the memory cgroup associated with the page,
- * or NULL. This function unlike page_memcg() can take any  page
+ * or NULL. This function unlike page_memcg() can take any page
  * as an argument. It has to be used in cases when it's not known if a page
- * has an associated memory cgroup pointer or an object cgroups vector.
+ * has an associated memory cgroup pointer or an object cgroups vector or
+ * an object cgroup.
  *
  * Any of the following ensures page and memcg binding stability:
  * - the page lock
  * - LRU isolation
  * - lock_page_memcg()
  * - exclusive reference
+ *
+ * Should be called under rcu lock which can protect memcg associated with a
+ * kmem page from being released.
  */
 static inline struct mem_cgroup *page_memcg_check(struct page *page)
 {
@@ -443,6 +459,13 @@  static inline struct mem_cgroup *page_memcg_check(struct page *page)
 	if (memcg_data & MEMCG_DATA_OBJCGS)
 		return NULL;
 
+	if (memcg_data & MEMCG_DATA_KMEM) {
+		struct obj_cgroup *objcg;
+
+		objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+		return obj_cgroup_memcg(objcg);
+	}
+
 	return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
 }
 
@@ -501,6 +524,25 @@  static inline struct obj_cgroup **page_objcgs_check(struct page *page)
 	return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
 }
 
+/*
+ * page_objcg - get the object cgroup associated with a kmem page
+ * @page: a pointer to the page struct
+ *
+ * Returns a pointer to the object cgroup associated with the kmem page,
+ * or NULL. This function assumes that the page is known to have an
+ * associated object cgroup. It's only safe to call this function
+ * against kmem pages (PageMemcgKmem() returns true).
+ */
+static inline struct obj_cgroup *page_objcg(struct page *page)
+{
+	unsigned long memcg_data = page->memcg_data;
+
+	VM_BUG_ON_PAGE(PageSlab(page), page);
+	VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_OBJCGS, page);
+	VM_BUG_ON_PAGE(!(memcg_data & MEMCG_DATA_KMEM), page);
+
+	return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
+}
 #else
 static inline struct obj_cgroup **page_objcgs(struct page *page)
 {
@@ -511,6 +553,11 @@  static inline struct obj_cgroup **page_objcgs_check(struct page *page)
 {
 	return NULL;
 }
+
+static inline struct obj_cgroup *page_objcg(struct page *page)
+{
+	return NULL;
+}
 #endif
 
 static __always_inline bool memcg_stat_item_in_bytes(int idx)
@@ -729,18 +776,6 @@  static inline void obj_cgroup_put(struct obj_cgroup *objcg)
 	percpu_ref_put(&objcg->refcnt);
 }
 
-/*
- * After the initialization objcg->memcg is always pointing at
- * a valid memcg, but can be atomically swapped to the parent memcg.
- *
- * The caller must ensure that the returned memcg won't be released:
- * e.g. acquire the rcu_read_lock or css_set_lock.
- */
-static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg)
-{
-	return READ_ONCE(objcg->memcg);
-}
-
 static inline void mem_cgroup_put(struct mem_cgroup *memcg)
 {
 	if (memcg)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e1dc73ceb98a..38376f9d6659 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -859,15 +859,26 @@  void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
 	pg_data_t *pgdat = page_pgdat(page);
 	struct lruvec *lruvec;
 
-	memcg = page_memcg_check(head);
-	/* Untracked pages have no memcg, no lruvec. Update only the node */
-	if (!memcg) {
-		__mod_node_page_state(pgdat, idx, val);
-		return;
+	if (PageMemcgKmem(head)) {
+		rcu_read_lock();
+		memcg = obj_cgroup_memcg(page_objcg(page));
+	} else {
+		memcg = page_memcg(head);
+		/*
+		 * Untracked pages have no memcg, no lruvec. Update only the
+		 * node.
+		 */
+		if (!memcg) {
+			__mod_node_page_state(pgdat, idx, val);
+			return;
+		}
 	}
 
 	lruvec = mem_cgroup_lruvec(memcg, pgdat);
 	__mod_lruvec_state(lruvec, idx, val);
+
+	if (PageMemcgKmem(head))
+		rcu_read_unlock();
 }
 EXPORT_SYMBOL(__mod_lruvec_page_state);
 
@@ -2906,6 +2917,20 @@  static void commit_charge(struct page *page, struct mem_cgroup *memcg)
 	page->memcg_data = (unsigned long)memcg;
 }
 
+static inline struct mem_cgroup *obj_cgroup_memcg_get(struct obj_cgroup *objcg)
+{
+	struct mem_cgroup *memcg;
+
+	rcu_read_lock();
+retry:
+	memcg = obj_cgroup_memcg(objcg);
+	if (unlikely(!css_tryget(&memcg->css)))
+		goto retry;
+	rcu_read_unlock();
+
+	return memcg;
+}
+
 #ifdef CONFIG_MEMCG_KMEM
 int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
 				 gfp_t gfp, bool new_page)
@@ -3071,15 +3096,8 @@  static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
 	struct mem_cgroup *memcg;
 	int ret;
 
-	rcu_read_lock();
-retry:
-	memcg = obj_cgroup_memcg(objcg);
-	if (unlikely(!css_tryget(&memcg->css)))
-		goto retry;
-	rcu_read_unlock();
-
+	memcg = obj_cgroup_memcg_get(objcg);
 	ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
-
 	css_put(&memcg->css);
 
 	return ret;
@@ -3144,18 +3162,18 @@  static void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_page
  */
 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
 {
-	struct mem_cgroup *memcg;
+	struct obj_cgroup *objcg;
 	int ret = 0;
 
-	memcg = get_mem_cgroup_from_current();
-	if (memcg && !mem_cgroup_is_root(memcg)) {
-		ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
+	objcg = get_obj_cgroup_from_current();
+	if (objcg) {
+		ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
 		if (!ret) {
-			page->memcg_data = (unsigned long)memcg |
+			page->memcg_data = (unsigned long)objcg |
 				MEMCG_DATA_KMEM;
 			return 0;
 		}
-		css_put(&memcg->css);
+		obj_cgroup_put(objcg);
 	}
 	return ret;
 }
@@ -3167,17 +3185,16 @@  int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
  */
 void __memcg_kmem_uncharge_page(struct page *page, int order)
 {
-	struct mem_cgroup *memcg;
+	struct obj_cgroup *objcg;
 	unsigned int nr_pages = 1 << order;
 
 	if (!page_memcg_charged(page))
 		return;
 
-	memcg = page_memcg_check(page);
-	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
-	__memcg_kmem_uncharge(memcg, nr_pages);
+	objcg = page_objcg(page);
+	obj_cgroup_uncharge_pages(objcg, nr_pages);
 	page->memcg_data = 0;
-	css_put(&memcg->css);
+	obj_cgroup_put(objcg);
 }
 
 static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
@@ -6806,11 +6823,23 @@  static inline void uncharge_gather_clear(struct uncharge_gather *ug)
 static void uncharge_batch(const struct uncharge_gather *ug)
 {
 	unsigned long flags;
+	unsigned long nr_pages;
 
-	if (!mem_cgroup_is_root(ug->memcg)) {
-		page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
+	/*
+	 * The kmem pages can be reparented to the root memcg, in
+	 * order to prevent the memory counter of root memcg from
+	 * increasing indefinitely. We should decrease the memory
+	 * counter when unchange.
+	 */
+	if (mem_cgroup_is_root(ug->memcg))
+		nr_pages = ug->nr_kmem;
+	else
+		nr_pages = ug->nr_pages;
+
+	if (nr_pages) {
+		page_counter_uncharge(&ug->memcg->memory, nr_pages);
 		if (do_memsw_account())
-			page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
+			page_counter_uncharge(&ug->memcg->memsw, nr_pages);
 		if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
 			page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
 		memcg_oom_recover(ug->memcg);
@@ -6828,7 +6857,7 @@  static void uncharge_batch(const struct uncharge_gather *ug)
 
 static void uncharge_page(struct page *page, struct uncharge_gather *ug)
 {
-	unsigned long nr_pages;
+	unsigned long nr_pages, nr_kmem;
 	struct mem_cgroup *memcg;
 
 	VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -6836,34 +6865,44 @@  static void uncharge_page(struct page *page, struct uncharge_gather *ug)
 	if (!page_memcg_charged(page))
 		return;
 
+	nr_pages = compound_nr(page);
 	/*
 	 * Nobody should be changing or seriously looking at
-	 * page memcg at this point, we have fully exclusive
-	 * access to the page.
+	 * page memcg or objcg at this point, we have fully
+	 * exclusive access to the page.
 	 */
-	memcg = page_memcg_check(page);
+	if (PageMemcgKmem(page)) {
+		struct obj_cgroup *objcg;
+
+		objcg = page_objcg(page);
+		memcg = obj_cgroup_memcg_get(objcg);
+
+		page->memcg_data = 0;
+		obj_cgroup_put(objcg);
+		nr_kmem = nr_pages;
+	} else {
+		memcg = page_memcg(page);
+		page->memcg_data = 0;
+		nr_kmem = 0;
+	}
+
 	if (ug->memcg != memcg) {
 		if (ug->memcg) {
 			uncharge_batch(ug);
 			uncharge_gather_clear(ug);
 		}
 		ug->memcg = memcg;
+		ug->dummy_page = page;
 
 		/* pairs with css_put in uncharge_batch */
 		css_get(&ug->memcg->css);
 	}
 
-	nr_pages = compound_nr(page);
 	ug->nr_pages += nr_pages;
+	ug->nr_kmem += nr_kmem;
+	ug->pgpgout += !nr_kmem;
 
-	if (PageMemcgKmem(page))
-		ug->nr_kmem += nr_pages;
-	else
-		ug->pgpgout++;
-
-	ug->dummy_page = page;
-	page->memcg_data = 0;
-	css_put(&ug->memcg->css);
+	css_put(&memcg->css);
 }
 
 /**