[v5,05/10] mm/swap: charge the page when adding to the swap cache

Message ID: 1585892447-32059-6-git-send-email-iamjoonsoo.kim@lge.com
State: New, archived
Series: workingset protection/detection on the anonymous LRU list

Commit Message

Joonsoo Kim April 3, 2020, 5:40 a.m. UTC
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>

Currently, some swapped-in pages are not charged to the memcg until the
page is actually accessed. I checked the code and found that this can
cause a problem: even with memcg enabled, one can consume a lot of system
memory by exploiting this hole. For example, one can get all of a task's
pages swapped out and then call madvise(MADV_WILLNEED) to load all of the
swapped-out pages back without putting any pressure on the memcg. Although
actual access still requires a charge, being able to bring the swapped-out
pages back into memory without pressing the memcg is already a big
benefit.
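
For illustration only (the region and size are made up), the abuse from
userspace is as simple as:

    #include <sys/mman.h>

    /*
     * Sketch: after @buf has been pushed out to swap, MADV_WILLNEED
     * reaches madvise_willneed() in the kernel and pulls the whole range
     * back into the swap cache -- without this patch, none of it is
     * charged to the caller's memcg until each page is actually touched.
     */
    static void repopulate(void *buf, size_t len)
    {
            madvise(buf, len, MADV_WILLNEED);
    }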

Also, for the workingset detection implemented in the following patch, a
memcg must be committed to the page before workingset detection runs. For
this purpose, the best solution, I think, is to charge the page when
adding it to the swap cache. Charging there is not that hard: the callers
that add a page to the swap cache have enough information about the memcg
to charge, so all we need to do is pass that information down to the
right place.

With this patch, a specific memcg could be pressured more, since
readahead pages are now charged to it as well. This could degrade
performance for that user, but it would be fair, since the readahead is
done on that user's behalf.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
---
 include/linux/swap.h |  4 ++--
 mm/shmem.c           | 18 ++++++++++--------
 mm/swap_state.c      | 25 +++++++++++++++++++++----
 3 files changed, 33 insertions(+), 14 deletions(-)

Comments

Yang Shi April 3, 2020, 6:29 p.m. UTC | #1
On Thu, Apr 2, 2020 at 10:41 PM <js1304@gmail.com> wrote:
>
> From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
>
> Currently, some swapped-in pages are not charged to the memcg until
> actual access to the page happens. I checked the code and found that
> it could cause a problem. In this implementation, even if the memcg
> is enabled, one can consume a lot of memory in the system by exploiting
> this hole. For example, one can make all the pages swapped out and
> then call madvise_willneed() to load the all swapped-out pages without
> pressing the memcg. Although actual access requires charging, it's really
> big benefit to load the swapped-out pages to the memory without pressing
> the memcg.
>
> And, for workingset detection which is implemented on the following patch,
> a memcg should be committed before the workingset detection is executed.
> For this purpose, the best solution, I think, is charging the page when
> adding to the swap cache. Charging there is not that hard. Caller of
> adding the page to the swap cache has enough information about the charged
> memcg. So, what we need to do is just passing this information to
> the right place.
>
> With this patch, specific memcg could be pressured more since readahead
> pages are also charged to it now. This would result in performance
> degradation to that user but it would be fair since that readahead is for
> that user.

If I read the code correctly, the readahead pages may *not* be charged
to that memcg at all, but to other memcgs, since mem_cgroup_try_charge()
would retrieve the target memcg id from the swap entry and then charge
that memcg (generally the one from which the page was swapped out). So it
may open a backdoor that lets one memcg stress other memcgs?
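
(For reference, this is roughly the lookup I mean, paraphrased from my
reading of mem_cgroup_try_charge(); the helper below is not a real
function, just a condensed version of that logic, so the exact guards may
differ:)

    static struct mem_cgroup *swapin_charge_target(struct page *page,
                                                   struct mm_struct *mm)
    {
            struct mem_cgroup *memcg = NULL;

            if (PageSwapCache(page) && do_swap_account) {
                    swp_entry_t ent = { .val = page_private(page), };
                    unsigned short id = lookup_swap_cgroup_id(ent);

                    rcu_read_lock();
                    /* prefer the memcg recorded at swapout time */
                    memcg = mem_cgroup_from_id(id);
                    if (memcg && !css_tryget_online(&memcg->css))
                            memcg = NULL;   /* owner gone, fall back to @mm */
                    rcu_read_unlock();
            }
            /* only if no owner is recorded does @mm's memcg get charged */
            return memcg ? memcg : get_mem_cgroup_from_mm(mm);
    }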

>
> Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> ---
>  include/linux/swap.h |  4 ++--
>  mm/shmem.c           | 18 ++++++++++--------
>  mm/swap_state.c      | 25 +++++++++++++++++++++----
>  3 files changed, 33 insertions(+), 14 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 273de48..eea0700 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -409,7 +409,7 @@ extern unsigned long total_swapcache_pages(void);
>  extern void show_swap_cache_info(void);
>  extern int add_to_swap(struct page *page);
>  extern int add_to_swap_cache(struct page *page, swp_entry_t entry,
> -                       gfp_t gfp, void **shadowp);
> +                       struct vm_area_struct *vma, gfp_t gfp, void **shadowp);
>  extern int __add_to_swap_cache(struct page *page, swp_entry_t entry);
>  extern void __delete_from_swap_cache(struct page *page,
>                         swp_entry_t entry, void *shadow);
> @@ -567,7 +567,7 @@ static inline int add_to_swap(struct page *page)
>  }
>
>  static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
> -                                       gfp_t gfp_mask, void **shadowp)
> +                       struct vm_area_struct *vma, gfp_t gfp, void **shadowp)
>  {
>         return -1;
>  }
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 9e34b4e..8e28c1f 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1369,7 +1369,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
>         if (list_empty(&info->swaplist))
>                 list_add(&info->swaplist, &shmem_swaplist);
>
> -       if (add_to_swap_cache(page, swap,
> +       if (add_to_swap_cache(page, swap, NULL,
>                         __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
>                         NULL) == 0) {
>                 spin_lock_irq(&info->lock);
> @@ -1434,10 +1434,11 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
>  #endif
>
>  static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
> -               struct shmem_inode_info *info, pgoff_t index)
> +                       struct mm_struct *mm, struct shmem_inode_info *info,
> +                       pgoff_t index)
>  {
>         /* Create a pseudo vma that just contains the policy */
> -       vma_init(vma, NULL);
> +       vma_init(vma, mm);
>         /* Bias interleave by inode number to distribute better across nodes */
>         vma->vm_pgoff = index + info->vfs_inode.i_ino;
>         vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
> @@ -1450,13 +1451,14 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
>  }
>
>  static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
> -                       struct shmem_inode_info *info, pgoff_t index)
> +                       struct mm_struct *mm, struct shmem_inode_info *info,
> +                       pgoff_t index)
>  {
>         struct vm_area_struct pvma;
>         struct page *page;
>         struct vm_fault vmf;
>
> -       shmem_pseudo_vma_init(&pvma, info, index);
> +       shmem_pseudo_vma_init(&pvma, mm, info, index);
>         vmf.vma = &pvma;
>         vmf.address = 0;
>         page = swap_cluster_readahead(swap, gfp, &vmf);
> @@ -1481,7 +1483,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
>                                                                 XA_PRESENT))
>                 return NULL;
>
> -       shmem_pseudo_vma_init(&pvma, info, hindex);
> +       shmem_pseudo_vma_init(&pvma, NULL, info, hindex);
>         page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
>                         HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
>         shmem_pseudo_vma_destroy(&pvma);
> @@ -1496,7 +1498,7 @@ static struct page *shmem_alloc_page(gfp_t gfp,
>         struct vm_area_struct pvma;
>         struct page *page;
>
> -       shmem_pseudo_vma_init(&pvma, info, index);
> +       shmem_pseudo_vma_init(&pvma, NULL, info, index);
>         page = alloc_page_vma(gfp, &pvma, 0);
>         shmem_pseudo_vma_destroy(&pvma);
>
> @@ -1652,7 +1654,7 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
>                         count_memcg_event_mm(charge_mm, PGMAJFAULT);
>                 }
>                 /* Here we actually start the io */
> -               page = shmem_swapin(swap, gfp, info, index);
> +               page = shmem_swapin(swap, gfp, charge_mm, info, index);
>                 if (!page) {
>                         error = -ENOMEM;
>                         goto failed;
> diff --git a/mm/swap_state.c b/mm/swap_state.c
> index f06af84..1db73a2 100644
> --- a/mm/swap_state.c
> +++ b/mm/swap_state.c
> @@ -112,7 +112,7 @@ void show_swap_cache_info(void)
>   * but sets SwapCache flag and private instead of mapping and index.
>   */
>  int add_to_swap_cache(struct page *page, swp_entry_t entry,
> -                       gfp_t gfp, void **shadowp)
> +                       struct vm_area_struct *vma, gfp_t gfp, void **shadowp)
>  {
>         struct address_space *address_space = swap_address_space(entry);
>         pgoff_t idx = swp_offset(entry);
> @@ -120,14 +120,26 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
>         unsigned long i, nr = compound_nr(page);
>         unsigned long nrexceptional = 0;
>         void *old;
> +       bool compound = !!compound_order(page);
> +       int error;
> +       struct mm_struct *mm = vma ? vma->vm_mm : current->mm;
> +       struct mem_cgroup *memcg;
>
>         VM_BUG_ON_PAGE(!PageLocked(page), page);
>         VM_BUG_ON_PAGE(PageSwapCache(page), page);
>         VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
>
>         page_ref_add(page, nr);
> +       /* PageSwapCache() prevent the page from being re-charged */
>         SetPageSwapCache(page);
>
> +       error = mem_cgroup_try_charge(page, mm, gfp, &memcg, compound);
> +       if (error) {
> +               ClearPageSwapCache(page);
> +               page_ref_sub(page, nr);
> +               return error;
> +       }
> +
>         do {
>                 xas_lock_irq(&xas);
>                 xas_create_range(&xas);
> @@ -153,11 +165,16 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
>                 xas_unlock_irq(&xas);
>         } while (xas_nomem(&xas, gfp));
>
> -       if (!xas_error(&xas))
> +       if (!xas_error(&xas)) {
> +               mem_cgroup_commit_charge(page, memcg, false, compound);
>                 return 0;
> +       }
> +
> +       mem_cgroup_cancel_charge(page, memcg, compound);
>
>         ClearPageSwapCache(page);
>         page_ref_sub(page, nr);
> +
>         return xas_error(&xas);
>  }
>
> @@ -221,7 +238,7 @@ int add_to_swap(struct page *page)
>         /*
>          * Add it to the swap cache.
>          */
> -       err = add_to_swap_cache(page, entry,
> +       err = add_to_swap_cache(page, entry, NULL,
>                         __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
>         if (err)
>                 /*
> @@ -431,7 +448,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
>                 /* May fail (-ENOMEM) if XArray node allocation failed. */
>                 __SetPageLocked(new_page);
>                 __SetPageSwapBacked(new_page);
> -               err = add_to_swap_cache(new_page, entry,
> +               err = add_to_swap_cache(new_page, entry, vma,
>                                 gfp_mask & GFP_KERNEL, NULL);
>                 if (likely(!err)) {
>                         /* Initiate read into locked page */
> --
> 2.7.4
>
>
Joonsoo Kim April 6, 2020, 1:03 a.m. UTC | #2
2020년 4월 4일 (토) 오전 3:29, Yang Shi <shy828301@gmail.com>님이 작성:
>
> On Thu, Apr 2, 2020 at 10:41 PM <js1304@gmail.com> wrote:
> >
> > From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> >
> > Currently, some swapped-in pages are not charged to the memcg until
> > actual access to the page happens. I checked the code and found that
> > it could cause a problem. In this implementation, even if the memcg
> > is enabled, one can consume a lot of memory in the system by exploiting
> > this hole. For example, one can make all the pages swapped out and
> > then call madvise_willneed() to load the all swapped-out pages without
> > pressing the memcg. Although actual access requires charging, it's really
> > big benefit to load the swapped-out pages to the memory without pressing
> > the memcg.
> >
> > And, for workingset detection which is implemented on the following patch,
> > a memcg should be committed before the workingset detection is executed.
> > For this purpose, the best solution, I think, is charging the page when
> > adding to the swap cache. Charging there is not that hard. Caller of
> > adding the page to the swap cache has enough information about the charged
> > memcg. So, what we need to do is just passing this information to
> > the right place.
> >
> > With this patch, specific memcg could be pressured more since readahead
> > pages are also charged to it now. This would result in performance
> > degradation to that user but it would be fair since that readahead is for
> > that user.
>
> If I read the code correctly, the readahead pages may be *not* charged
> to it at all but other memcgs since mem_cgroup_try_charge() would
> retrieve the target memcg id from the swap entry then charge to it
> (generally it is the memcg from who the page is swapped out). So, it
> may open a backdoor to let one memcg stress other memcgs?

It looks like you are talking about the call path under CONFIG_MEMCG_SWAP.

The owner (task) of an anonymous page cannot change, which means that the
previous owner recorded in the swap entry will also be the next user. So I
think that using the target memcg id from the swap entry for readahead
pages is a valid approach.

As for your concern: if someone could steer swap readahead toward another
memcg's swap entries, one memcg could indeed stress another memcg using the
fact above. However, as far as I know, there is no explicit way to read
ahead someone else's swap entries, so there is no problem.

Thanks.
Yang Shi April 7, 2020, 12:22 a.m. UTC | #3
On Sun, Apr 5, 2020 at 6:03 PM Joonsoo Kim <js1304@gmail.com> wrote:
>
> 2020년 4월 4일 (토) 오전 3:29, Yang Shi <shy828301@gmail.com>님이 작성:
> >
> > On Thu, Apr 2, 2020 at 10:41 PM <js1304@gmail.com> wrote:
> > >
> > > From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> > >
> > > Currently, some swapped-in pages are not charged to the memcg until
> > > actual access to the page happens. I checked the code and found that
> > > it could cause a problem. In this implementation, even if the memcg
> > > is enabled, one can consume a lot of memory in the system by exploiting
> > > this hole. For example, one can make all the pages swapped out and
> > > then call madvise_willneed() to load the all swapped-out pages without
> > > pressing the memcg. Although actual access requires charging, it's really
> > > big benefit to load the swapped-out pages to the memory without pressing
> > > the memcg.
> > >
> > > And, for workingset detection which is implemented on the following patch,
> > > a memcg should be committed before the workingset detection is executed.
> > > For this purpose, the best solution, I think, is charging the page when
> > > adding to the swap cache. Charging there is not that hard. Caller of
> > > adding the page to the swap cache has enough information about the charged
> > > memcg. So, what we need to do is just passing this information to
> > > the right place.
> > >
> > > With this patch, specific memcg could be pressured more since readahead
> > > pages are also charged to it now. This would result in performance
> > > degradation to that user but it would be fair since that readahead is for
> > > that user.
> >
> > If I read the code correctly, the readahead pages may be *not* charged
> > to it at all but other memcgs since mem_cgroup_try_charge() would
> > retrieve the target memcg id from the swap entry then charge to it
> > (generally it is the memcg from who the page is swapped out). So, it
> > may open a backdoor to let one memcg stress other memcgs?
>
> It looks like you talk about the call path on CONFIG_MEMCG_SWAP.
>
> The owner (task) for a anonymous page cannot be changed. It means that
> the previous owner written on the swap entry will be the next user. So,
> I think that using the target memcg id from the swap entry for readahead pages
> is valid way.
>
> As you concerned, if someone can control swap-readahead to readahead
> other's swap entry, one memcg could stress other memcg by using the fact above.
> However, as far as I know, there is no explicit way to readahead other's swap
> entry so no problem.

Swap cluster readahead would read in pages at consecutive swap entries,
which may belong to different memcgs. However, I just figured out that
patch #8 ("mm/swap: do not readahead if the previous owner of the swap
entry isn't me") would prevent reading ahead pages belonging to other
memcgs. This would kill the potential problem.
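
(Just to illustrate the idea -- I have not checked the actual patch #8, so
the sketch below is only my guess at the check, reusing the existing swap
cgroup helpers:)

    /*
     * Sketch: skip readahead of a neighbouring entry whose recorded
     * owner differs from the memcg of the task doing the swap-in.
     */
    static bool swap_entry_is_mine(swp_entry_t entry)
    {
            struct mem_cgroup *memcg = get_mem_cgroup_from_mm(current->mm);
            bool mine = lookup_swap_cgroup_id(entry) == mem_cgroup_id(memcg);

            css_put(&memcg->css);
            return mine;
    }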

> Thanks.
Joonsoo Kim April 7, 2020, 1:27 a.m. UTC | #4
2020년 4월 7일 (화) 오전 9:22, Yang Shi <shy828301@gmail.com>님이 작성:
>
> On Sun, Apr 5, 2020 at 6:03 PM Joonsoo Kim <js1304@gmail.com> wrote:
> >
> > 2020년 4월 4일 (토) 오전 3:29, Yang Shi <shy828301@gmail.com>님이 작성:
> > >
> > > On Thu, Apr 2, 2020 at 10:41 PM <js1304@gmail.com> wrote:
> > > >
> > > > From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> > > >
> > > > Currently, some swapped-in pages are not charged to the memcg until
> > > > actual access to the page happens. I checked the code and found that
> > > > it could cause a problem. In this implementation, even if the memcg
> > > > is enabled, one can consume a lot of memory in the system by exploiting
> > > > this hole. For example, one can make all the pages swapped out and
> > > > then call madvise_willneed() to load the all swapped-out pages without
> > > > pressing the memcg. Although actual access requires charging, it's really
> > > > big benefit to load the swapped-out pages to the memory without pressing
> > > > the memcg.
> > > >
> > > > And, for workingset detection which is implemented on the following patch,
> > > > a memcg should be committed before the workingset detection is executed.
> > > > For this purpose, the best solution, I think, is charging the page when
> > > > adding to the swap cache. Charging there is not that hard. Caller of
> > > > adding the page to the swap cache has enough information about the charged
> > > > memcg. So, what we need to do is just passing this information to
> > > > the right place.
> > > >
> > > > With this patch, specific memcg could be pressured more since readahead
> > > > pages are also charged to it now. This would result in performance
> > > > degradation to that user but it would be fair since that readahead is for
> > > > that user.
> > >
> > > If I read the code correctly, the readahead pages may be *not* charged
> > > to it at all but other memcgs since mem_cgroup_try_charge() would
> > > retrieve the target memcg id from the swap entry then charge to it
> > > (generally it is the memcg from who the page is swapped out). So, it
> > > may open a backdoor to let one memcg stress other memcgs?
> >
> > It looks like you talk about the call path on CONFIG_MEMCG_SWAP.
> >
> > The owner (task) for a anonymous page cannot be changed. It means that
> > the previous owner written on the swap entry will be the next user. So,
> > I think that using the target memcg id from the swap entry for readahead pages
> > is valid way.
> >
> > As you concerned, if someone can control swap-readahead to readahead
> > other's swap entry, one memcg could stress other memcg by using the fact above.
> > However, as far as I know, there is no explicit way to readahead other's swap
> > entry so no problem.
>
> Swap cluster readahead would readahead in pages on consecutive swap
> entries which may belong to different memcgs, however I just figured
> out patch #8 ("mm/swap: do not readahead if the previous owner of the
> swap entry isn't me") would prevent from reading ahead pages belonging
> to other memcgs. This would kill the potential problem.

Yes, that patch kills the potential problem. However, I think that swap
cluster readahead would not open the backdoor even without patch #8 in
the CONFIG_MEMCG_SWAP case, because:

1. consecutive swap space is usually filled by the same task.
2. swap cluster readahead costs the offender a lot of I/O, while its
effect on the target isn't serious.
3. those pages would be charged to their previous owner, which is valid.

Thanks.
Johannes Weiner April 16, 2020, 4:11 p.m. UTC | #5
Hello Joonsoo,

On Fri, Apr 03, 2020 at 02:40:43PM +0900, js1304@gmail.com wrote:
> @@ -112,7 +112,7 @@ void show_swap_cache_info(void)
>   * but sets SwapCache flag and private instead of mapping and index.
>   */
>  int add_to_swap_cache(struct page *page, swp_entry_t entry,
> -			gfp_t gfp, void **shadowp)
> +			struct vm_area_struct *vma, gfp_t gfp, void **shadowp)
>  {
>  	struct address_space *address_space = swap_address_space(entry);
>  	pgoff_t idx = swp_offset(entry);
> @@ -120,14 +120,26 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
>  	unsigned long i, nr = compound_nr(page);
>  	unsigned long nrexceptional = 0;
>  	void *old;
> +	bool compound = !!compound_order(page);
> +	int error;
> +	struct mm_struct *mm = vma ? vma->vm_mm : current->mm;
> +	struct mem_cgroup *memcg;
>  
>  	VM_BUG_ON_PAGE(!PageLocked(page), page);
>  	VM_BUG_ON_PAGE(PageSwapCache(page), page);
>  	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
>  
>  	page_ref_add(page, nr);
> +	/* PageSwapCache() prevent the page from being re-charged */
>  	SetPageSwapCache(page);
>  
> +	error = mem_cgroup_try_charge(page, mm, gfp, &memcg, compound);
> +	if (error) {
> +		ClearPageSwapCache(page);
> +		page_ref_sub(page, nr);
> +		return error;
> +	}
> +
>  	do {
>  		xas_lock_irq(&xas);
>  		xas_create_range(&xas);
> @@ -153,11 +165,16 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
>  		xas_unlock_irq(&xas);
>  	} while (xas_nomem(&xas, gfp));
>  
> -	if (!xas_error(&xas))
> +	if (!xas_error(&xas)) {
> +		mem_cgroup_commit_charge(page, memcg, false, compound);

Unfortunately you cannot commit here yet because the rmap isn't set up
and that will cause memcg_charge_statistics() to account the page
incorrectly as file. And rmap is only set up during a page fault.

This needs a bit of a rework of the memcg charging code that appears
out of scope for your patches. I'm preparing a series right now to do
just that. It'll also fix the swap ownership tracking problem when the
swap controller is disabled, so we don't have to choose between
charging the wrong cgroup or hampering swap readahead efficiency.

The patches also unblock Alex Shi's "per lruvec lru_lock for memcg"
series, which is all the more indication that memcg needs fixing in
that area.
Joonsoo Kim April 17, 2020, 1:38 a.m. UTC | #6
2020년 4월 17일 (금) 오전 1:11, Johannes Weiner <hannes@cmpxchg.org>님이 작성:
>
> Hello Joonsoo,
>
> On Fri, Apr 03, 2020 at 02:40:43PM +0900, js1304@gmail.com wrote:
> > @@ -112,7 +112,7 @@ void show_swap_cache_info(void)
> >   * but sets SwapCache flag and private instead of mapping and index.
> >   */
> >  int add_to_swap_cache(struct page *page, swp_entry_t entry,
> > -                     gfp_t gfp, void **shadowp)
> > +                     struct vm_area_struct *vma, gfp_t gfp, void **shadowp)
> >  {
> >       struct address_space *address_space = swap_address_space(entry);
> >       pgoff_t idx = swp_offset(entry);
> > @@ -120,14 +120,26 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
> >       unsigned long i, nr = compound_nr(page);
> >       unsigned long nrexceptional = 0;
> >       void *old;
> > +     bool compound = !!compound_order(page);
> > +     int error;
> > +     struct mm_struct *mm = vma ? vma->vm_mm : current->mm;
> > +     struct mem_cgroup *memcg;
> >
> >       VM_BUG_ON_PAGE(!PageLocked(page), page);
> >       VM_BUG_ON_PAGE(PageSwapCache(page), page);
> >       VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
> >
> >       page_ref_add(page, nr);
> > +     /* PageSwapCache() prevent the page from being re-charged */
> >       SetPageSwapCache(page);
> >
> > +     error = mem_cgroup_try_charge(page, mm, gfp, &memcg, compound);
> > +     if (error) {
> > +             ClearPageSwapCache(page);
> > +             page_ref_sub(page, nr);
> > +             return error;
> > +     }
> > +
> >       do {
> >               xas_lock_irq(&xas);
> >               xas_create_range(&xas);
> > @@ -153,11 +165,16 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
> >               xas_unlock_irq(&xas);
> >       } while (xas_nomem(&xas, gfp));
> >
> > -     if (!xas_error(&xas))
> > +     if (!xas_error(&xas)) {
> > +             mem_cgroup_commit_charge(page, memcg, false, compound);
>
> Unfortunately you cannot commit here yet because the rmap isn't set up
> and that will cause memcg_charge_statistics() to account the page
> incorrectly as file. And rmap is only set up during a page fault.

I also found this problem a few days ago. From my investigation, what we
need to make the accounting correct for an anonymous page is a way to tell
the type of the page, file or anon, since there is no special code there
that uses the rmap. And I think that could be done by checking for a NULL
mapping or something else. Is there anything I missed? Also, I cannot find
the function memcg_charge_statistics(); please let me know which file it
is in.

This is just out of curiosity, and I agree with what you commented below.

> This needs a bit of a rework of the memcg charging code that appears
> out of scope for your patches. I'm preparing a series right now to do
> just that. It'll also fix the swap ownership tracking problem when the
> swap controller is disabled, so we don't have to choose between
> charging the wrong cgroup or hampering swap readahead efficiency.

Sounds good! I also think that these changes are out of scope for my
series. I will wait for your patches. Could you let me know when your
series is submitted? I'd like to plan my work schedule around your patch
schedule.

> The patches also unblock Alex Shi's "per lruvec lru_lock for memcg"
> series, which is all the more indication that memcg needs fixing in
> that area.

Okay.

Thanks.
Johannes Weiner April 17, 2020, 3:31 a.m. UTC | #7
On Fri, Apr 17, 2020 at 10:38:53AM +0900, Joonsoo Kim wrote:
> 2020년 4월 17일 (금) 오전 1:11, Johannes Weiner <hannes@cmpxchg.org>님이 작성:
> >
> > Hello Joonsoo,
> >
> > On Fri, Apr 03, 2020 at 02:40:43PM +0900, js1304@gmail.com wrote:
> > > @@ -112,7 +112,7 @@ void show_swap_cache_info(void)
> > >   * but sets SwapCache flag and private instead of mapping and index.
> > >   */
> > >  int add_to_swap_cache(struct page *page, swp_entry_t entry,
> > > -                     gfp_t gfp, void **shadowp)
> > > +                     struct vm_area_struct *vma, gfp_t gfp, void **shadowp)
> > >  {
> > >       struct address_space *address_space = swap_address_space(entry);
> > >       pgoff_t idx = swp_offset(entry);
> > > @@ -120,14 +120,26 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
> > >       unsigned long i, nr = compound_nr(page);
> > >       unsigned long nrexceptional = 0;
> > >       void *old;
> > > +     bool compound = !!compound_order(page);
> > > +     int error;
> > > +     struct mm_struct *mm = vma ? vma->vm_mm : current->mm;
> > > +     struct mem_cgroup *memcg;
> > >
> > >       VM_BUG_ON_PAGE(!PageLocked(page), page);
> > >       VM_BUG_ON_PAGE(PageSwapCache(page), page);
> > >       VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
> > >
> > >       page_ref_add(page, nr);
> > > +     /* PageSwapCache() prevent the page from being re-charged */
> > >       SetPageSwapCache(page);
> > >
> > > +     error = mem_cgroup_try_charge(page, mm, gfp, &memcg, compound);
> > > +     if (error) {
> > > +             ClearPageSwapCache(page);
> > > +             page_ref_sub(page, nr);
> > > +             return error;
> > > +     }
> > > +
> > >       do {
> > >               xas_lock_irq(&xas);
> > >               xas_create_range(&xas);
> > > @@ -153,11 +165,16 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
> > >               xas_unlock_irq(&xas);
> > >       } while (xas_nomem(&xas, gfp));
> > >
> > > -     if (!xas_error(&xas))
> > > +     if (!xas_error(&xas)) {
> > > +             mem_cgroup_commit_charge(page, memcg, false, compound);
> >
> > Unfortunately you cannot commit here yet because the rmap isn't set up
> > and that will cause memcg_charge_statistics() to account the page
> > incorrectly as file. And rmap is only set up during a page fault.
> 
> I also found this problem a few days ago. In my investigation, what we need for
> anonymous page to make accounting correct is a way to find the type of the page,
> file or anon, since there is no special code to use the rmap. And, I
> think that it
> could be done by checking NULL mapping or something else.

page->mapping is NULL for truncated file pages, file pages before they
are inserted into the page cache, and anon pages before the rmap. It's
not straightforward to robustly tell those apart inside memcg.
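
(For reference, PageAnon() boils down to a single bit in page->mapping,
and that bit only appears once the rmap is set up at fault time -- so at
swapin/readahead time the page is indistinguishable from a file page:)

    /* essentially the body of PageAnon(); nothing else marks a page anon */
    static inline bool page_looks_anon(struct page *page)
    {
            /* PAGE_MAPPING_ANON is only set when the rmap is established */
            return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
    }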

But fundamentally, it's a silly problem to fix. We only need to tell
page types apart to maintain the MEMCG_CACHE and MEMCG_RSS
counters. But these are unnecessary duplicates of the NR_FILE_PAGES
and NR_ANON_MAPPED vmstat counters - counters for which we already
have accounting sites in generic code, and that already exist in the
per-cgroup vmstat arrays. We just need to link them.

So that's what I'm fixing instead: I'm adjusting the charging sequence
slightly so that page->mem_cgroup is stable when the core VM code
accounts pages by type. And then I'm hooking into these places with
mod_lruvec_page_state and friends, and ditching MEMCG_CACHE/MEMCG_RSS.
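
(Roughly the kind of hook I mean, called from the rmap setup path --
illustrative only, not the final patches; the helper name below is made
up:)

    /* account a freshly mapped anon page once page->mem_cgroup is stable */
    static void account_new_anon_page(struct page *page, bool compound)
    {
            int nr = compound ? hpage_nr_pages(page) : 1;

            /* updates the node counter and the page's memcg counter together */
            __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
    }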

After that, memcg doesn't have to know about the types of pages at
all. It can focus on maintaining page_counters and page->mem_cgroup,
and leave the vmstat breakdown to generic VM code.

Then we can charge pages right after allocation, regardless of type.

[ Eventually, the memcg accounting interface shouldn't be much more
  than GFP_ACCOUNT (with memalloc_use_memcg() for faults, swap etc.),
  and the vmstat hooks. My patches don't quite get there, but that's
  the direction they're pushing. ]
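
(memalloc_use_memcg() already exists today for kmem-style accounting; a
usage sketch, assuming @memcg has been looked up and pinned elsewhere:)

    /* charge whatever is allocated in this window to @memcg, not current */
    static struct page *alloc_page_for(struct mem_cgroup *memcg)
    {
            struct page *page;

            memalloc_use_memcg(memcg);
            page = alloc_page(GFP_KERNEL | __GFP_ACCOUNT);
            memalloc_unuse_memcg();

            return page;
    }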

> Is there anything I missed? And, I cannot find the function,
> memcg_charge_statistics(). Please let me know the file name of this
> function.

The correct name is mem_cgroup_charge_statistics(), my apologies.

> > This needs a bit of a rework of the memcg charging code that appears
> > out of scope for your patches. I'm preparing a series right now to do
> > just that. It'll also fix the swap ownership tracking problem when the
> > swap controller is disabled, so we don't have to choose between
> > charging the wrong cgroup or hampering swap readahead efficiency.
> 
> Sound good! I also think that these patches are out of scope of my series.
> I will wait your patches. Could you let me know when your series is submitted?
> I'd like to plan my work schedule based on your patch schedule.

I just finished the first draft of the series a few minutes ago. I'll
polish it a bit, make sure it compiles etc. ;-), and send it out for
review, testing and rebasing, hopefully tomorrow or early next week.
Joonsoo Kim April 17, 2020, 3:57 a.m. UTC | #8
2020년 4월 17일 (금) 오후 12:31, Johannes Weiner <hannes@cmpxchg.org>님이 작성:
>
> On Fri, Apr 17, 2020 at 10:38:53AM +0900, Joonsoo Kim wrote:
> > 2020년 4월 17일 (금) 오전 1:11, Johannes Weiner <hannes@cmpxchg.org>님이 작성:
> > >
> > > Hello Joonsoo,
> > >
> > > On Fri, Apr 03, 2020 at 02:40:43PM +0900, js1304@gmail.com wrote:
> > > > @@ -112,7 +112,7 @@ void show_swap_cache_info(void)
> > > >   * but sets SwapCache flag and private instead of mapping and index.
> > > >   */
> > > >  int add_to_swap_cache(struct page *page, swp_entry_t entry,
> > > > -                     gfp_t gfp, void **shadowp)
> > > > +                     struct vm_area_struct *vma, gfp_t gfp, void **shadowp)
> > > >  {
> > > >       struct address_space *address_space = swap_address_space(entry);
> > > >       pgoff_t idx = swp_offset(entry);
> > > > @@ -120,14 +120,26 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
> > > >       unsigned long i, nr = compound_nr(page);
> > > >       unsigned long nrexceptional = 0;
> > > >       void *old;
> > > > +     bool compound = !!compound_order(page);
> > > > +     int error;
> > > > +     struct mm_struct *mm = vma ? vma->vm_mm : current->mm;
> > > > +     struct mem_cgroup *memcg;
> > > >
> > > >       VM_BUG_ON_PAGE(!PageLocked(page), page);
> > > >       VM_BUG_ON_PAGE(PageSwapCache(page), page);
> > > >       VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
> > > >
> > > >       page_ref_add(page, nr);
> > > > +     /* PageSwapCache() prevent the page from being re-charged */
> > > >       SetPageSwapCache(page);
> > > >
> > > > +     error = mem_cgroup_try_charge(page, mm, gfp, &memcg, compound);
> > > > +     if (error) {
> > > > +             ClearPageSwapCache(page);
> > > > +             page_ref_sub(page, nr);
> > > > +             return error;
> > > > +     }
> > > > +
> > > >       do {
> > > >               xas_lock_irq(&xas);
> > > >               xas_create_range(&xas);
> > > > @@ -153,11 +165,16 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
> > > >               xas_unlock_irq(&xas);
> > > >       } while (xas_nomem(&xas, gfp));
> > > >
> > > > -     if (!xas_error(&xas))
> > > > +     if (!xas_error(&xas)) {
> > > > +             mem_cgroup_commit_charge(page, memcg, false, compound);
> > >
> > > Unfortunately you cannot commit here yet because the rmap isn't set up
> > > and that will cause memcg_charge_statistics() to account the page
> > > incorrectly as file. And rmap is only set up during a page fault.
> >
> > I also found this problem a few days ago. In my investigation, what we need for
> > anonymous page to make accounting correct is a way to find the type of the page,
> > file or anon, since there is no special code to use the rmap. And, I
> > think that it
> > could be done by checking NULL mapping or something else.
>
> page->mapping is NULL for truncated file pages, file pages before they
> are inserted into the page cache, and anon pages before the rmap. It's
> not straight-forward to robustly tell those apart inside memcg.

Okay.

> But fundamentally, it's a silly problem to fix. We only need to tell
> page types apart to maintain the MEMCG_CACHE and MEMCG_RSS
> counters. But these are unnecessary duplicates of the NR_FILE_PAGES
> and NR_ANON_MAPPED vmstat counters - counters for which we already
> have accounting sites in generic code, and that already exist in the
> per-cgroup vmstat arrays. We just need to link them.
>
> So that's what I'm fixing instead: I'm adjusting the charging sequence
> slightly so that page->mem_cgroup is stable when the core VM code
> accounts pages by type. And then I'm hooking into these places with
> mod_lruvec_page_state and friends, and ditching MEMCG_CACHE/MEMCG_RSS.
>
> After that, memcg doesn't have to know about the types of pages at
> all. It can focus on maintaining page_counters and page->mem_cgroup,
> and leave the vmstat breakdown to generic VM code.
>
> Then we can charge pages right after allocation, regardless of type.
>
> [ Eventually, the memcg accounting interface shouldn't be much more
>   than GFP_ACCOUNT (with memalloc_use_memcg() for faults, swap etc.),
>   and the vmstat hooks. My patches don't quite get there, but that's
>   the direction they're pushing. ]

Thanks for the explanation! It will help me understand your future patches.

> > Is there anything I missed? And, I cannot find the function,
> > memcg_charge_statistics(). Please let me know the file name of this
> > function.
>
> The correct name is mem_cgroup_charge_statistics(), my apologies.

No problem.

> > > This needs a bit of a rework of the memcg charging code that appears
> > > out of scope for your patches. I'm preparing a series right now to do
> > > just that. It'll also fix the swap ownership tracking problem when the
> > > swap controller is disabled, so we don't have to choose between
> > > charging the wrong cgroup or hampering swap readahead efficiency.
> >
> > Sound good! I also think that these patches are out of scope of my series.
> > I will wait your patches. Could you let me know when your series is submitted?
> > I'd like to plan my work schedule based on your patch schedule.
>
> I just finished the first draft of the series a few minutes ago. I'll
> polish it a bit, make sure it compiles etc. ;-), and send it out for
> review, testing and rebasing, hopefully tomorrow or early next week.

Sounds good! See you again next week. :)

Thanks.

Patch

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 273de48..eea0700 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -409,7 +409,7 @@  extern unsigned long total_swapcache_pages(void);
 extern void show_swap_cache_info(void);
 extern int add_to_swap(struct page *page);
 extern int add_to_swap_cache(struct page *page, swp_entry_t entry,
-			gfp_t gfp, void **shadowp);
+			struct vm_area_struct *vma, gfp_t gfp, void **shadowp);
 extern int __add_to_swap_cache(struct page *page, swp_entry_t entry);
 extern void __delete_from_swap_cache(struct page *page,
 			swp_entry_t entry, void *shadow);
@@ -567,7 +567,7 @@  static inline int add_to_swap(struct page *page)
 }
 
 static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
-					gfp_t gfp_mask, void **shadowp)
+			struct vm_area_struct *vma, gfp_t gfp, void **shadowp)
 {
 	return -1;
 }
diff --git a/mm/shmem.c b/mm/shmem.c
index 9e34b4e..8e28c1f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1369,7 +1369,7 @@  static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	if (list_empty(&info->swaplist))
 		list_add(&info->swaplist, &shmem_swaplist);
 
-	if (add_to_swap_cache(page, swap,
+	if (add_to_swap_cache(page, swap, NULL,
 			__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
 			NULL) == 0) {
 		spin_lock_irq(&info->lock);
@@ -1434,10 +1434,11 @@  static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
 #endif
 
 static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
-		struct shmem_inode_info *info, pgoff_t index)
+			struct mm_struct *mm, struct shmem_inode_info *info,
+			pgoff_t index)
 {
 	/* Create a pseudo vma that just contains the policy */
-	vma_init(vma, NULL);
+	vma_init(vma, mm);
 	/* Bias interleave by inode number to distribute better across nodes */
 	vma->vm_pgoff = index + info->vfs_inode.i_ino;
 	vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
@@ -1450,13 +1451,14 @@  static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
 }
 
 static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
-			struct shmem_inode_info *info, pgoff_t index)
+			struct mm_struct *mm, struct shmem_inode_info *info,
+			pgoff_t index)
 {
 	struct vm_area_struct pvma;
 	struct page *page;
 	struct vm_fault vmf;
 
-	shmem_pseudo_vma_init(&pvma, info, index);
+	shmem_pseudo_vma_init(&pvma, mm, info, index);
 	vmf.vma = &pvma;
 	vmf.address = 0;
 	page = swap_cluster_readahead(swap, gfp, &vmf);
@@ -1481,7 +1483,7 @@  static struct page *shmem_alloc_hugepage(gfp_t gfp,
 								XA_PRESENT))
 		return NULL;
 
-	shmem_pseudo_vma_init(&pvma, info, hindex);
+	shmem_pseudo_vma_init(&pvma, NULL, info, hindex);
 	page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
 			HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
 	shmem_pseudo_vma_destroy(&pvma);
@@ -1496,7 +1498,7 @@  static struct page *shmem_alloc_page(gfp_t gfp,
 	struct vm_area_struct pvma;
 	struct page *page;
 
-	shmem_pseudo_vma_init(&pvma, info, index);
+	shmem_pseudo_vma_init(&pvma, NULL, info, index);
 	page = alloc_page_vma(gfp, &pvma, 0);
 	shmem_pseudo_vma_destroy(&pvma);
 
@@ -1652,7 +1654,7 @@  static int shmem_swapin_page(struct inode *inode, pgoff_t index,
 			count_memcg_event_mm(charge_mm, PGMAJFAULT);
 		}
 		/* Here we actually start the io */
-		page = shmem_swapin(swap, gfp, info, index);
+		page = shmem_swapin(swap, gfp, charge_mm, info, index);
 		if (!page) {
 			error = -ENOMEM;
 			goto failed;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index f06af84..1db73a2 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -112,7 +112,7 @@  void show_swap_cache_info(void)
  * but sets SwapCache flag and private instead of mapping and index.
  */
 int add_to_swap_cache(struct page *page, swp_entry_t entry,
-			gfp_t gfp, void **shadowp)
+			struct vm_area_struct *vma, gfp_t gfp, void **shadowp)
 {
 	struct address_space *address_space = swap_address_space(entry);
 	pgoff_t idx = swp_offset(entry);
@@ -120,14 +120,26 @@  int add_to_swap_cache(struct page *page, swp_entry_t entry,
 	unsigned long i, nr = compound_nr(page);
 	unsigned long nrexceptional = 0;
 	void *old;
+	bool compound = !!compound_order(page);
+	int error;
+	struct mm_struct *mm = vma ? vma->vm_mm : current->mm;
+	struct mem_cgroup *memcg;
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(PageSwapCache(page), page);
 	VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
 
 	page_ref_add(page, nr);
+	/* PageSwapCache() prevents the page from being re-charged */
 	SetPageSwapCache(page);
 
+	error = mem_cgroup_try_charge(page, mm, gfp, &memcg, compound);
+	if (error) {
+		ClearPageSwapCache(page);
+		page_ref_sub(page, nr);
+		return error;
+	}
+
 	do {
 		xas_lock_irq(&xas);
 		xas_create_range(&xas);
@@ -153,11 +165,16 @@  int add_to_swap_cache(struct page *page, swp_entry_t entry,
 		xas_unlock_irq(&xas);
 	} while (xas_nomem(&xas, gfp));
 
-	if (!xas_error(&xas))
+	if (!xas_error(&xas)) {
+		mem_cgroup_commit_charge(page, memcg, false, compound);
 		return 0;
+	}
+
+	mem_cgroup_cancel_charge(page, memcg, compound);
 
 	ClearPageSwapCache(page);
 	page_ref_sub(page, nr);
+
 	return xas_error(&xas);
 }
 
@@ -221,7 +238,7 @@  int add_to_swap(struct page *page)
 	/*
 	 * Add it to the swap cache.
 	 */
-	err = add_to_swap_cache(page, entry,
+	err = add_to_swap_cache(page, entry, NULL,
 			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
 	if (err)
 		/*
@@ -431,7 +448,7 @@  struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		/* May fail (-ENOMEM) if XArray node allocation failed. */
 		__SetPageLocked(new_page);
 		__SetPageSwapBacked(new_page);
-		err = add_to_swap_cache(new_page, entry,
+		err = add_to_swap_cache(new_page, entry, vma,
 				gfp_mask & GFP_KERNEL, NULL);
 		if (likely(!err)) {
 			/* Initiate read into locked page */