
[v4,6/8] mm/swap: implement workingset detection for anonymous LRU

Message ID: 1584942732-2184-7-git-send-email-iamjoonsoo.kim@lge.com (mailing list archive)
State: New, archived
Series: workingset protection/detection on the anonymous LRU list

Commit Message

Joonsoo Kim March 23, 2020, 5:52 a.m. UTC
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>

This patch implements workingset detection for anonymous LRU.
All the infrastructure is implemented by the previous patches so this patch
just activates the workingset detection by installing/retrieving
the shadow entry.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
---
 include/linux/swap.h |  6 ++++++
 mm/memory.c          |  7 ++++++-
 mm/swap_state.c      | 20 ++++++++++++++++++--
 mm/vmscan.c          |  7 +++++--
 4 files changed, 35 insertions(+), 5 deletions(-)

Comments

Johannes Weiner March 23, 2020, 5:17 p.m. UTC | #1
On Mon, Mar 23, 2020 at 02:52:10PM +0900, js1304@gmail.com wrote:
> From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> 
> This patch implements workingset detection for anonymous LRU.
> All the infrastructure is implemented by the previous patches so this patch
> just activates the workingset detection by installing/retrieving
> the shadow entry.
> 
> Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> ---
>  include/linux/swap.h |  6 ++++++
>  mm/memory.c          |  7 ++++++-
>  mm/swap_state.c      | 20 ++++++++++++++++++--
>  mm/vmscan.c          |  7 +++++--
>  4 files changed, 35 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 273de48..fb4772e 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -408,6 +408,7 @@ extern struct address_space *swapper_spaces[];
>  extern unsigned long total_swapcache_pages(void);
>  extern void show_swap_cache_info(void);
>  extern int add_to_swap(struct page *page);
> +extern void *get_shadow_from_swap_cache(swp_entry_t entry);
>  extern int add_to_swap_cache(struct page *page, swp_entry_t entry,
>  			gfp_t gfp, void **shadowp);
>  extern int __add_to_swap_cache(struct page *page, swp_entry_t entry);
> @@ -566,6 +567,11 @@ static inline int add_to_swap(struct page *page)
>  	return 0;
>  }
>  
> +static inline void *get_shadow_from_swap_cache(swp_entry_t entry)
> +{
> +	return NULL;
> +}
> +
>  static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
>  					gfp_t gfp_mask, void **shadowp)
>  {
> diff --git a/mm/memory.c b/mm/memory.c
> index 5f7813a..91a2097 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2925,10 +2925,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
>  							vmf->address);
>  			if (page) {
> +				void *shadow;
> +
>  				__SetPageLocked(page);
>  				__SetPageSwapBacked(page);
>  				set_page_private(page, entry.val);
> -				lru_cache_add_anon(page);
> +				shadow = get_shadow_from_swap_cache(entry);
> +				if (shadow)
> +					workingset_refault(page, shadow);

Hm, this is calling workingset_refault() on a page that isn't charged
to a cgroup yet. That means the refault stats and inactive age counter
will be bumped incorrectly in the root cgroup instead of the real one.

> +				lru_cache_add(page);
>  				swap_readpage(page, true);
>  			}
>  		} else {

You need to look up and remember the shadow entry at the top and call
workingset_refault() after mem_cgroup_commit_charge() has run.

It'd be nice if we could do the shadow lookup for everybody in
lookup_swap_cache(), but that's subject to race conditions if multiple
faults on the same swap page happen in multiple vmas concurrently. The
swapcache bypass scenario is only safe because it checks that there is
a single pte under the mmap sem to prevent forking. So it looks like
you have to bubble up the shadow entry through swapin_readahead().
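
For illustration only, the ordering described above could be sketched roughly as
follows for the swapcache-bypass path. This is not code from the series; it just
spells out the suggested order of operations using the
mem_cgroup_try_charge_delay()/mem_cgroup_commit_charge() calls do_swap_page()
already makes, with locking, error handling and LRU-placement details omitted:

	/*
	 * Sketch only: look up and remember the shadow entry up front,
	 * but do not consume it before the page is charged.
	 */
	void *shadow = get_shadow_from_swap_cache(entry);

	page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
	if (page) {
		__SetPageLocked(page);
		__SetPageSwapBacked(page);
		set_page_private(page, entry.val);
		lru_cache_add(page);
		swap_readpage(page, true);
	}

	/* ... later in do_swap_page(), using its existing memcg/ret locals ... */
	if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL,
					&memcg, false)) {
		ret = VM_FAULT_OOM;
		goto out_page;
	}
	/* ... pte lock, recheck and pte setup ... */
	mem_cgroup_commit_charge(page, memcg, false, false);

	/*
	 * Only now is the page charged to the right cgroup, so the
	 * refault stats and inactive age go to that cgroup, not root.
	 */
	if (shadow)
		workingset_refault(page, shadow);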
Joonsoo Kim March 24, 2020, 6:25 a.m. UTC | #2
On Tue, Mar 24, 2020 at 2:17 AM, Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Mon, Mar 23, 2020 at 02:52:10PM +0900, js1304@gmail.com wrote:
> > From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> >
> > This patch implements workingset detection for anonymous LRU.
> > All the infrastructure is implemented by the previous patches so this patch
> > just activates the workingset detection by installing/retrieving
> > the shadow entry.
> >
> > Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> > ---
> >  include/linux/swap.h |  6 ++++++
> >  mm/memory.c          |  7 ++++++-
> >  mm/swap_state.c      | 20 ++++++++++++++++++--
> >  mm/vmscan.c          |  7 +++++--
> >  4 files changed, 35 insertions(+), 5 deletions(-)
> >
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index 273de48..fb4772e 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -408,6 +408,7 @@ extern struct address_space *swapper_spaces[];
> >  extern unsigned long total_swapcache_pages(void);
> >  extern void show_swap_cache_info(void);
> >  extern int add_to_swap(struct page *page);
> > +extern void *get_shadow_from_swap_cache(swp_entry_t entry);
> >  extern int add_to_swap_cache(struct page *page, swp_entry_t entry,
> >                       gfp_t gfp, void **shadowp);
> >  extern int __add_to_swap_cache(struct page *page, swp_entry_t entry);
> > @@ -566,6 +567,11 @@ static inline int add_to_swap(struct page *page)
> >       return 0;
> >  }
> >
> > +static inline void *get_shadow_from_swap_cache(swp_entry_t entry)
> > +{
> > +     return NULL;
> > +}
> > +
> >  static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
> >                                       gfp_t gfp_mask, void **shadowp)
> >  {
> > diff --git a/mm/memory.c b/mm/memory.c
> > index 5f7813a..91a2097 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -2925,10 +2925,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> >                       page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
> >                                                       vmf->address);
> >                       if (page) {
> > +                             void *shadow;
> > +
> >                               __SetPageLocked(page);
> >                               __SetPageSwapBacked(page);
> >                               set_page_private(page, entry.val);
> > -                             lru_cache_add_anon(page);
> > +                             shadow = get_shadow_from_swap_cache(entry);
> > +                             if (shadow)
> > +                                     workingset_refault(page, shadow);
>
> Hm, this is calling workingset_refault() on a page that isn't charged
> to a cgroup yet. That means the refault stats and inactive age counter
> will be bumped incorrectly in the root cgroup instead of the real one.

Okay.

> > +                             lru_cache_add(page);
> >                               swap_readpage(page, true);
> >                       }
> >               } else {
>
> You need to look up and remember the shadow entry at the top and call
> workingset_refault() after mem_cgroup_commit_charge() has run.

Okay. I will call workingset_refault() after charging.

I completely missed that workingset_refault() should be called after charging.
workingset_refault() in __read_swap_cache_async() also has the same problem.

> It'd be nice if we could do the shadow lookup for everybody in
> lookup_swap_cache(), but that's subject to race conditions if multiple
> faults on the same swap page happen in multiple vmas concurrently. The
> swapcache bypass scenario is only safe because it checks that there is
> a single pte under the mmap sem to prevent forking. So it looks like
> you have to bubble up the shadow entry through swapin_readahead().

The problem doesn't look that easy. Hmm...

In the current code, there is a large time gap between when the shadow
entry is popped from the swap cache and when the page is charged to the
memcg, especially for readahead pages. We cannot keep the shadow entries
of the readahead pages around until those pages are charged.

My plan to solve this problem is to pass the mm to be charged down to
__read_swap_cache_async(), as the file cache does: charge when the page
is added to the swap cache and call workingset_refault() there.
Charging would only occur for:

1. the faulted page
2. a readahead page whose shadow entry belongs to the same memcg

Also, readahead would only happen when the shadow entry's memcg is the
same as the memcg being charged. If they differ, the page is most likely
not ours, so readahead isn't needed.
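
To make this concrete, the relevant part of __read_swap_cache_async() could end
up looking roughly like the sketch below. This is hypothetical code for
discussion, not the posted patch: the extra charge_mm argument is an assumed
new parameter, and the error path is elided.

	/* Hypothetical sketch: charge at swap-cache insertion time. */
	shadow = NULL;
	err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL, &shadow);
	if (likely(!err)) {
		struct mem_cgroup *memcg;

		/*
		 * Charge to the mm that triggered the swapin, like the
		 * file cache charges when a page enters the page cache.
		 */
		if (mem_cgroup_try_charge(new_page, charge_mm, gfp_mask,
					  &memcg, false)) {
			/* error path elided: remove from swap cache and bail */
			return NULL;
		}
		mem_cgroup_commit_charge(new_page, memcg, false, false);

		/* Now the refault is attributed to the correct memcg. */
		SetPageWorkingset(new_page);
		if (shadow)
			workingset_refault(new_page, shadow);
		lru_cache_add(new_page);
		*new_page_allocated = true;
		return new_page;
	}

For a readahead page, the shadow entry's memcg could additionally be compared
against charge_mm's memcg before this point, skipping both the charge and the
readahead if they differ, as described above.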

Please let me know what you think about the feasibility of this idea.

Thanks.
Joonsoo Kim April 2, 2020, 5:50 a.m. UTC | #3
On Tue, Mar 24, 2020 at 3:25 PM, Joonsoo Kim <js1304@gmail.com> wrote:
>
> On Tue, Mar 24, 2020 at 2:17 AM, Johannes Weiner <hannes@cmpxchg.org> wrote:
> >
> > On Mon, Mar 23, 2020 at 02:52:10PM +0900, js1304@gmail.com wrote:
> > > From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> > >
> > > This patch implements workingset detection for anonymous LRU.
> > > All the infrastructure is implemented by the previous patches so this patch
> > > just activates the workingset detection by installing/retrieving
> > > the shadow entry.
> > >
> > > Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> > > ---
> > >  include/linux/swap.h |  6 ++++++
> > >  mm/memory.c          |  7 ++++++-
> > >  mm/swap_state.c      | 20 ++++++++++++++++++--
> > >  mm/vmscan.c          |  7 +++++--
> > >  4 files changed, 35 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > > index 273de48..fb4772e 100644
> > > --- a/include/linux/swap.h
> > > +++ b/include/linux/swap.h
> > > @@ -408,6 +408,7 @@ extern struct address_space *swapper_spaces[];
> > >  extern unsigned long total_swapcache_pages(void);
> > >  extern void show_swap_cache_info(void);
> > >  extern int add_to_swap(struct page *page);
> > > +extern void *get_shadow_from_swap_cache(swp_entry_t entry);
> > >  extern int add_to_swap_cache(struct page *page, swp_entry_t entry,
> > >                       gfp_t gfp, void **shadowp);
> > >  extern int __add_to_swap_cache(struct page *page, swp_entry_t entry);
> > > @@ -566,6 +567,11 @@ static inline int add_to_swap(struct page *page)
> > >       return 0;
> > >  }
> > >
> > > +static inline void *get_shadow_from_swap_cache(swp_entry_t entry)
> > > +{
> > > +     return NULL;
> > > +}
> > > +
> > >  static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
> > >                                       gfp_t gfp_mask, void **shadowp)
> > >  {
> > > diff --git a/mm/memory.c b/mm/memory.c
> > > index 5f7813a..91a2097 100644
> > > --- a/mm/memory.c
> > > +++ b/mm/memory.c
> > > @@ -2925,10 +2925,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
> > >                       page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
> > >                                                       vmf->address);
> > >                       if (page) {
> > > +                             void *shadow;
> > > +
> > >                               __SetPageLocked(page);
> > >                               __SetPageSwapBacked(page);
> > >                               set_page_private(page, entry.val);
> > > -                             lru_cache_add_anon(page);
> > > +                             shadow = get_shadow_from_swap_cache(entry);
> > > +                             if (shadow)
> > > +                                     workingset_refault(page, shadow);
> >
> > Hm, this is calling workingset_refault() on a page that isn't charged
> > to a cgroup yet. That means the refault stats and inactive age counter
> > will be bumped incorrectly in the root cgroup instead of the real one.
>
> Okay.
>
> > > +                             lru_cache_add(page);
> > >                               swap_readpage(page, true);
> > >                       }
> > >               } else {
> >
> > You need to look up and remember the shadow entry at the top and call
> > workingset_refault() after mem_cgroup_commit_charge() has run.
>
> Okay. I will call workingset_refault() after charging.
>
> I completely missed that workingset_refault() should be called after charging.
> workingset_refault() in __read_swap_cache_async() also has the same problem.
>
> > It'd be nice if we could do the shadow lookup for everybody in
> > lookup_swap_cache(), but that's subject to race conditions if multiple
> > faults on the same swap page happen in multiple vmas concurrently. The
> > swapcache bypass scenario is only safe because it checks that there is
> > a single pte under the mmap sem to prevent forking. So it looks like
> > you have to bubble up the shadow entry through swapin_readahead().
>
> The problem doesn't look that easy. Hmm...
>
> In the current code, there is a large time gap between when the shadow
> entry is popped from the swap cache and when the page is charged to the
> memcg, especially for readahead pages. We cannot keep the shadow entries
> of the readahead pages around until those pages are charged.
>
> My plan to solve this problem is to pass the mm to be charged down to
> __read_swap_cache_async(), as the file cache does: charge when the page
> is added to the swap cache and call workingset_refault() there.
> Charging would only occur for:
>
> 1. the faulted page
> 2. a readahead page whose shadow entry belongs to the same memcg
>
> Also, readahead would only happen when the shadow entry's memcg is the
> same as the memcg being charged. If they differ, the page is most likely
> not ours, so readahead isn't needed.
>
> Please let me know what you think about the feasibility of this idea.

Hello, Johannes.

Could you let me know your opinion about the idea above?
In fact, while waiting for your reply, I have completed a solution based
on the above idea. If you want, I will submit it first; then we could
discuss the solution more easily.

Thanks.
Johannes Weiner April 2, 2020, 3:14 p.m. UTC | #4
On Thu, Apr 02, 2020 at 02:50:28PM +0900, Joonsoo Kim wrote:
> On Tue, Mar 24, 2020 at 3:25 PM, Joonsoo Kim <js1304@gmail.com> wrote:
> > The problem doesn't look that easy. Hmm...
> >
> > In the current code, there is a large time gap between when the shadow
> > entry is popped from the swap cache and when the page is charged to the
> > memcg, especially for readahead pages. We cannot keep the shadow entries
> > of the readahead pages around until those pages are charged.
> >
> > My plan to solve this problem is to pass the mm to be charged down to
> > __read_swap_cache_async(), as the file cache does: charge when the page
> > is added to the swap cache and call workingset_refault() there.
> > Charging would only occur for:
> >
> > 1. the faulted page
> > 2. a readahead page whose shadow entry belongs to the same memcg
> >
> > Also, readahead would only happen when the shadow entry's memcg is the
> > same as the memcg being charged. If they differ, the page is most likely
> > not ours, so readahead isn't needed.
> >
> > Please let me know what you think about the feasibility of this idea.
> 
> Hello, Johannes.
> 
> Could you let me know your opinion about the idea above?
> In fact, while waiting for your reply, I have completed a solution based
> on the above idea. If you want, I will submit it first; then we could
> discuss the solution more easily.

It's probably easiest if you send out your implementation and we
discuss it over the code.

Thanks!

Patch

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 273de48..fb4772e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -408,6 +408,7 @@  extern struct address_space *swapper_spaces[];
 extern unsigned long total_swapcache_pages(void);
 extern void show_swap_cache_info(void);
 extern int add_to_swap(struct page *page);
+extern void *get_shadow_from_swap_cache(swp_entry_t entry);
 extern int add_to_swap_cache(struct page *page, swp_entry_t entry,
 			gfp_t gfp, void **shadowp);
 extern int __add_to_swap_cache(struct page *page, swp_entry_t entry);
@@ -566,6 +567,11 @@  static inline int add_to_swap(struct page *page)
 	return 0;
 }
 
+static inline void *get_shadow_from_swap_cache(swp_entry_t entry)
+{
+	return NULL;
+}
+
 static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
 					gfp_t gfp_mask, void **shadowp)
 {
diff --git a/mm/memory.c b/mm/memory.c
index 5f7813a..91a2097 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2925,10 +2925,15 @@  vm_fault_t do_swap_page(struct vm_fault *vmf)
 			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
 							vmf->address);
 			if (page) {
+				void *shadow;
+
 				__SetPageLocked(page);
 				__SetPageSwapBacked(page);
 				set_page_private(page, entry.val);
-				lru_cache_add_anon(page);
+				shadow = get_shadow_from_swap_cache(entry);
+				if (shadow)
+					workingset_refault(page, shadow);
+				lru_cache_add(page);
 				swap_readpage(page, true);
 			}
 		} else {
diff --git a/mm/swap_state.c b/mm/swap_state.c
index f06af84..f996455 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -107,6 +107,18 @@  void show_swap_cache_info(void)
 	printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
 }
 
+void *get_shadow_from_swap_cache(swp_entry_t entry)
+{
+	struct address_space *address_space = swap_address_space(entry);
+	pgoff_t idx = swp_offset(entry);
+	struct page *page;
+
+	page = find_get_entry(address_space, idx);
+	if (xa_is_value(page))
+		return page;
+	return NULL;
+}
+
 /*
  * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
  * but sets SwapCache flag and private instead of mapping and index.
@@ -376,6 +388,7 @@  struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 	struct page *found_page = NULL, *new_page = NULL;
 	struct swap_info_struct *si;
 	int err;
+	void *shadow;
 	*new_page_allocated = false;
 
 	do {
@@ -431,12 +444,15 @@  struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		/* May fail (-ENOMEM) if XArray node allocation failed. */
 		__SetPageLocked(new_page);
 		__SetPageSwapBacked(new_page);
+		shadow = NULL;
 		err = add_to_swap_cache(new_page, entry,
-				gfp_mask & GFP_KERNEL, NULL);
+				gfp_mask & GFP_KERNEL, &shadow);
 		if (likely(!err)) {
 			/* Initiate read into locked page */
 			SetPageWorkingset(new_page);
-			lru_cache_add_anon(new_page);
+			if (shadow)
+				workingset_refault(new_page, shadow);
+			lru_cache_add(new_page);
 			*new_page_allocated = true;
 			return new_page;
 		}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9871861..b37cc26 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -867,6 +867,7 @@  static int __remove_mapping(struct address_space *mapping, struct page *page,
 {
 	unsigned long flags;
 	int refcount;
+	void *shadow = NULL;
 
 	BUG_ON(!PageLocked(page));
 	BUG_ON(mapping != page_mapping(page));
@@ -909,12 +910,13 @@  static int __remove_mapping(struct address_space *mapping, struct page *page,
 	if (PageSwapCache(page)) {
 		swp_entry_t swap = { .val = page_private(page) };
 		mem_cgroup_swapout(page, swap);
-		__delete_from_swap_cache(page, swap, NULL);
+		if (reclaimed && !mapping_exiting(mapping))
+			shadow = workingset_eviction(page, target_memcg);
+		__delete_from_swap_cache(page, swap, shadow);
 		xa_unlock_irqrestore(&mapping->i_pages, flags);
 		put_swap_page(page, swap);
 	} else {
 		void (*freepage)(struct page *);
-		void *shadow = NULL;
 
 		freepage = mapping->a_ops->freepage;
 		/*
@@ -1476,6 +1478,7 @@  static unsigned long shrink_page_list(struct list_head *page_list,
 			SetPageActive(page);
 			stat->nr_activate[type] += nr_pages;
 			count_memcg_page_event(page, PGACTIVATE);
+			workingset_activation(page);
 		}
 keep_locked:
 		unlock_page(page);