
[2/3] mm: Charge active memcg when no mm is set

Message ID 20210329144829.1834347-3-schatzberg.dan@gmail.com (mailing list archive)
State New, archived
Series Charge loop device i/o to issuing cgroup

Commit Message

Dan Schatzberg March 29, 2021, 2:48 p.m. UTC
set_active_memcg() worked for kernel allocations but was silently
ignored for user pages.

This patch establishes a precedence order for who gets charged:

1. If there is a memcg associated with the page already, that memcg is
   charged. This happens during swapin.

2. If an explicit mm is passed, mm->memcg is charged. This happens
   during page faults, which can be triggered in remote VMs (e.g. gup).

3. Otherwise consult the current process context. If there is an
   active_memcg, use that. Otherwise, current->mm->memcg.

Previously, if a NULL mm was passed to mem_cgroup_charge (case 3) it
would always charge the root cgroup. Now it looks up the active_memcg
first (falling back to charging the root cgroup if not set).
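
For illustration, a minimal sketch of how a caller (such as the loop
driver later in this series) would exercise case 3. This is an example,
not part of the patch; memcg, page and err stand in for the caller's own
variables, and the caller is assumed to already hold a reference on the
target memcg:

	struct mem_cgroup *old_memcg;
	int err;

	/* make the issuing cgroup's memcg the active one */
	old_memcg = set_active_memcg(memcg);
	/* no mm is passed, so the active memcg (case 3) gets charged */
	err = mem_cgroup_charge(page, NULL, GFP_KERNEL);
	set_active_memcg(old_memcg);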

Signed-off-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Chris Down <chris@chrisdown.name>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
---
 mm/filemap.c    |  2 +-
 mm/memcontrol.c | 72 ++++++++++++++++++++++++++++---------------------
 mm/shmem.c      |  4 +--
 3 files changed, 44 insertions(+), 34 deletions(-)

Comments

Muchun Song March 29, 2021, 4:13 p.m. UTC | #1
On Mon, Mar 29, 2021 at 10:49 PM Dan Schatzberg
<schatzberg.dan@gmail.com> wrote:
>
> set_active_memcg() worked for kernel allocations but was silently
> ignored for user pages.
>
> This patch establishes a precedence order for who gets charged:
>
> 1. If there is a memcg associated with the page already, that memcg is
>    charged. This happens during swapin.
>
> 2. If an explicit mm is passed, mm->memcg is charged. This happens
>    during page faults, which can be triggered in remote VMs (e.g. gup).
>
> 3. Otherwise consult the current process context. If there is an
>    active_memcg, use that. Otherwise, current->mm->memcg.
>
> Previously, if a NULL mm was passed to mem_cgroup_charge (case 3) it
> would always charge the root cgroup. Now it looks up the active_memcg
> first (falling back to charging the root cgroup if not set).
>
> Signed-off-by: Dan Schatzberg <schatzberg.dan@gmail.com>
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
> Acked-by: Tejun Heo <tj@kernel.org>
> Acked-by: Chris Down <chris@chrisdown.name>
> Reviewed-by: Shakeel Butt <shakeelb@google.com>
> ---
>  mm/filemap.c    |  2 +-
>  mm/memcontrol.c | 72 ++++++++++++++++++++++++++++---------------------
>  mm/shmem.c      |  4 +--
>  3 files changed, 44 insertions(+), 34 deletions(-)
>
> diff --git a/mm/filemap.c b/mm/filemap.c
> index eeeb8e2cc36a..63fd980e863a 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -872,7 +872,7 @@ noinline int __add_to_page_cache_locked(struct page *page,
>         page->index = offset;
>
>         if (!huge) {
> -               error = mem_cgroup_charge(page, current->mm, gfp);
> +               error = mem_cgroup_charge(page, NULL, gfp);
>                 if (error)
>                         goto error;
>                 charged = true;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 668d1d7c2645..adc618814fd2 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -884,13 +884,38 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
>  }
>  EXPORT_SYMBOL(mem_cgroup_from_task);
>
> +static __always_inline struct mem_cgroup *active_memcg(void)
> +{
> +       if (in_interrupt())
> +               return this_cpu_read(int_active_memcg);
> +       else
> +               return current->active_memcg;
> +}
> +
> +static __always_inline struct mem_cgroup *get_active_memcg(void)
> +{
> +       struct mem_cgroup *memcg;
> +
> +       rcu_read_lock();
> +       memcg = active_memcg();
> +       /* remote memcg must hold a ref. */
> +       if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
> +               memcg = root_mem_cgroup;
> +       rcu_read_unlock();
> +
> +       return memcg;
> +}

This function has already been removed by the patchset below:

  Use obj_cgroup APIs to charge kmem pages
  https://lore.kernel.org/patchwork/cover/1399132/

I also suggest not reintroducing get_active_memcg().
It has only one caller, so just inline it into
get_mem_cgroup_from_mm(); we don't really need
get_active_memcg() at all.

> +
>  /**
>   * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
>   * @mm: mm from which memcg should be extracted. It can be NULL.
>   *
> - * Obtain a reference on mm->memcg and returns it if successful. Otherwise
> - * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is
> - * returned.
> + * Obtain a reference on mm->memcg and returns it if successful. If mm
> + * is NULL, then the memcg is chosen as follows:
> + * 1) The active memcg, if set.
> + * 2) current->mm->memcg, if available
> + * 3) root memcg
> + * If mem_cgroup is disabled, NULL is returned.
>   */
>  struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
>  {
> @@ -899,13 +924,19 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
>         if (mem_cgroup_disabled())
>                 return NULL;
>
> +       /*
> +        * Page cache insertions can happen without an
> +        * actual mm context, e.g. during disk probing
> +        * on boot, loopback IO, acct() writes etc.
> +        */
> +       if (unlikely(!mm)) {
> +               if (unlikely(active_memcg()))
> +                       return get_active_memcg();

Since the remote memcg must already hold a reference, we do
not need the css_tryget() dance that get_active_memcg() does.
Just use css_get() to obtain a ref; it is simpler, like below:

+       if (unlikely(!mm)) {
+               memcg = active_memcg();
+               if (unlikely(memcg)) {
+                       /* remote memcg must hold a ref. */
+                       css_get(memcg);
+                       return memcg;
+               }

Thanks.

> +               mm = current->mm;
> +       }
> +
>         rcu_read_lock();
>         do {
> -               /*
> -                * Page cache insertions can happen withou an
> -                * actual mm context, e.g. during disk probing
> -                * on boot, loopback IO, acct() writes etc.
> -                */
>                 if (unlikely(!mm))
>                         memcg = root_mem_cgroup;
>                 else {
> @@ -919,28 +950,6 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
>  }
>  EXPORT_SYMBOL(get_mem_cgroup_from_mm);
>
> -static __always_inline struct mem_cgroup *active_memcg(void)
> -{
> -       if (in_interrupt())
> -               return this_cpu_read(int_active_memcg);
> -       else
> -               return current->active_memcg;
> -}
> -
> -static __always_inline struct mem_cgroup *get_active_memcg(void)
> -{
> -       struct mem_cgroup *memcg;
> -
> -       rcu_read_lock();
> -       memcg = active_memcg();
> -       /* remote memcg must hold a ref. */
> -       if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
> -               memcg = root_mem_cgroup;
> -       rcu_read_unlock();
> -
> -       return memcg;
> -}
> -
>  static __always_inline bool memcg_kmem_bypass(void)
>  {
>         /* Allow remote memcg charging from any context. */
> @@ -6549,7 +6558,8 @@ static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg,
>   * @gfp_mask: reclaim mode
>   *
>   * Try to charge @page to the memcg that @mm belongs to, reclaiming
> - * pages according to @gfp_mask if necessary.
> + * pages according to @gfp_mask if necessary. if @mm is NULL, try to
> + * charge to the active memcg.
>   *
>   * Do not use this for pages allocated for swapin.
>   *
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 78ab81a62b29..7c09276125d5 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1694,7 +1694,7 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
>  {
>         struct address_space *mapping = inode->i_mapping;
>         struct shmem_inode_info *info = SHMEM_I(inode);
> -       struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
> +       struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
>         struct page *page;
>         swp_entry_t swap;
>         int error;
> @@ -1815,7 +1815,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
>         }
>
>         sbinfo = SHMEM_SB(inode->i_sb);
> -       charge_mm = vma ? vma->vm_mm : current->mm;
> +       charge_mm = vma ? vma->vm_mm : NULL;
>
>         page = pagecache_get_page(mapping, index,
>                                         FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0);
> --
> 2.30.2
>
Shakeel Butt March 29, 2021, 8:59 p.m. UTC | #2
On Mon, Mar 29, 2021 at 9:13 AM Muchun Song <songmuchun@bytedance.com> wrote:
>
> On Mon, Mar 29, 2021 at 10:49 PM Dan Schatzberg
> <schatzberg.dan@gmail.com> wrote:
[...]
>
> Since the remote memcg must already hold a reference, we do
> not need the css_tryget() dance that get_active_memcg() does.
> Just use css_get() to obtain a ref; it is simpler, like below:
>
> +       if (unlikely(!mm)) {
> +               memcg = active_memcg();
> +               if (unlikely(memcg)) {
> +                       /* remote memcg must hold a ref. */
> +                       css_get(memcg);
> +                       return memcg;
> +               }
>

I second Muchun's suggestion.
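
For reference, a minimal sketch of the !mm branch of
get_mem_cgroup_from_mm() with that suggestion applied (illustrative
only, not the final patch; css_get() is called on &memcg->css as
elsewhere in memcontrol.c):

	if (unlikely(!mm)) {
		memcg = active_memcg();
		if (unlikely(memcg)) {
			/* remote memcg must hold a ref. */
			css_get(&memcg->css);
			return memcg;
		}
		/* no active memcg set: fall back to current->mm */
		mm = current->mm;
	}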

Patch

diff --git a/mm/filemap.c b/mm/filemap.c
index eeeb8e2cc36a..63fd980e863a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -872,7 +872,7 @@  noinline int __add_to_page_cache_locked(struct page *page,
 	page->index = offset;
 
 	if (!huge) {
-		error = mem_cgroup_charge(page, current->mm, gfp);
+		error = mem_cgroup_charge(page, NULL, gfp);
 		if (error)
 			goto error;
 		charged = true;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 668d1d7c2645..adc618814fd2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -884,13 +884,38 @@  struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 }
 EXPORT_SYMBOL(mem_cgroup_from_task);
 
+static __always_inline struct mem_cgroup *active_memcg(void)
+{
+	if (in_interrupt())
+		return this_cpu_read(int_active_memcg);
+	else
+		return current->active_memcg;
+}
+
+static __always_inline struct mem_cgroup *get_active_memcg(void)
+{
+	struct mem_cgroup *memcg;
+
+	rcu_read_lock();
+	memcg = active_memcg();
+	/* remote memcg must hold a ref. */
+	if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
+		memcg = root_mem_cgroup;
+	rcu_read_unlock();
+
+	return memcg;
+}
+
 /**
  * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
  * @mm: mm from which memcg should be extracted. It can be NULL.
  *
- * Obtain a reference on mm->memcg and returns it if successful. Otherwise
- * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is
- * returned.
+ * Obtain a reference on mm->memcg and returns it if successful. If mm
+ * is NULL, then the memcg is chosen as follows:
+ * 1) The active memcg, if set.
+ * 2) current->mm->memcg, if available
+ * 3) root memcg
+ * If mem_cgroup is disabled, NULL is returned.
  */
 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
@@ -899,13 +924,19 @@  struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 	if (mem_cgroup_disabled())
 		return NULL;
 
+	/*
+	 * Page cache insertions can happen without an
+	 * actual mm context, e.g. during disk probing
+	 * on boot, loopback IO, acct() writes etc.
+	 */
+	if (unlikely(!mm)) {
+		if (unlikely(active_memcg()))
+			return get_active_memcg();
+		mm = current->mm;
+	}
+
 	rcu_read_lock();
 	do {
-		/*
-		 * Page cache insertions can happen withou an
-		 * actual mm context, e.g. during disk probing
-		 * on boot, loopback IO, acct() writes etc.
-		 */
 		if (unlikely(!mm))
 			memcg = root_mem_cgroup;
 		else {
@@ -919,28 +950,6 @@  struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 }
 EXPORT_SYMBOL(get_mem_cgroup_from_mm);
 
-static __always_inline struct mem_cgroup *active_memcg(void)
-{
-	if (in_interrupt())
-		return this_cpu_read(int_active_memcg);
-	else
-		return current->active_memcg;
-}
-
-static __always_inline struct mem_cgroup *get_active_memcg(void)
-{
-	struct mem_cgroup *memcg;
-
-	rcu_read_lock();
-	memcg = active_memcg();
-	/* remote memcg must hold a ref. */
-	if (memcg && WARN_ON_ONCE(!css_tryget(&memcg->css)))
-		memcg = root_mem_cgroup;
-	rcu_read_unlock();
-
-	return memcg;
-}
-
 static __always_inline bool memcg_kmem_bypass(void)
 {
 	/* Allow remote memcg charging from any context. */
@@ -6549,7 +6558,8 @@  static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg,
  * @gfp_mask: reclaim mode
  *
  * Try to charge @page to the memcg that @mm belongs to, reclaiming
- * pages according to @gfp_mask if necessary.
+ * pages according to @gfp_mask if necessary. if @mm is NULL, try to
+ * charge to the active memcg.
  *
  * Do not use this for pages allocated for swapin.
  *
diff --git a/mm/shmem.c b/mm/shmem.c
index 78ab81a62b29..7c09276125d5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1694,7 +1694,7 @@  static int shmem_swapin_page(struct inode *inode, pgoff_t index,
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct shmem_inode_info *info = SHMEM_I(inode);
-	struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
+	struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
 	struct page *page;
 	swp_entry_t swap;
 	int error;
@@ -1815,7 +1815,7 @@  static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	}
 
 	sbinfo = SHMEM_SB(inode->i_sb);
-	charge_mm = vma ? vma->vm_mm : current->mm;
+	charge_mm = vma ? vma->vm_mm : NULL;
 
 	page = pagecache_get_page(mapping, index,
 					FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0);