
[RESEND,v2] mm: don't call lru draining in the nested lru_cache_disable

Message ID 20211230193627.495145-1-minchan@kernel.org (mailing list archive)
State New
Series [RESEND,v2] mm: don't call lru draining in the nested lru_cache_disable

Commit Message

Minchan Kim Dec. 30, 2021, 7:36 p.m. UTC
lru_cache_disable involves IPIs to drain the pagevec of each core,
which sometimes takes quite a long time to complete depending
on how busy the CPUs are, making the allocation slow, up to
several hundred milliseconds. Furthermore, the repeated draining
in alloc_contig_range makes things worse, considering that callers
of alloc_contig_range usually retry multiple times in a loop.

This patch makes lru_cache_disable aware that the pagevec was
already disabled. With that, users of alloc_contig_range can disable
the LRU cache in advance in their own context across the repeated
trials, so they can avoid the multiple costly drains during CMA
allocation.

Signed-off-by: Minchan Kim <minchan@kernel.org>
---
 * from v1 - https://lore.kernel.org/lkml/20211206221006.946661-1-minchan@kernel.org/
   * fix lru_cache_disable race - akpm

 include/linux/swap.h | 14 ++------------
 mm/cma.c             |  5 +++++
 mm/swap.c            | 30 ++++++++++++++++++++++++++++--
 3 files changed, 35 insertions(+), 14 deletions(-)

Comments

Minchan Kim Jan. 6, 2022, 6:14 p.m. UTC | #1
On Thu, Dec 30, 2021 at 11:36:27AM -0800, Minchan Kim wrote:
> lru_cache_disable involves IPIs to drain pagevec of each core,
> which sometimes takes quite long time to complete depending
> on cpu's business, which makes allocation too slow up to
> sveral hundredth milliseconds. Furthermore, the repeated draining
> in the alloc_contig_range makes thing worse considering caller
> of alloc_contig_range usually tries multiple times in the loop.
> 
> This patch makes the lru_cache_disable aware of the fact the
> pagevec was already disabled. With that, user of alloc_contig_range
> can disable the lru cache in advance in their context during the
> repeated trial so they can avoid the multiple costly draining
> in cma allocation.

Hi Folks,

Any comments so this work can proceed?

> 
> Signed-off-by: Minchan Kim <minchan@kernel.org>
> ---
>  * from v1 - https://lore.kernel.org/lkml/20211206221006.946661-1-minchan@kernel.org/
>    * fix lru_cache_disable race - akpm
> 
>  include/linux/swap.h | 14 ++------------
>  mm/cma.c             |  5 +++++
>  mm/swap.c            | 30 ++++++++++++++++++++++++++++--
>  3 files changed, 35 insertions(+), 14 deletions(-)
> 
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index ba52f3a3478e..fe18e86a4f13 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -348,19 +348,9 @@ extern void lru_note_cost_page(struct page *);
>  extern void lru_cache_add(struct page *);
>  extern void mark_page_accessed(struct page *);
>  
> -extern atomic_t lru_disable_count;
> -
> -static inline bool lru_cache_disabled(void)
> -{
> -	return atomic_read(&lru_disable_count);
> -}
> -
> -static inline void lru_cache_enable(void)
> -{
> -	atomic_dec(&lru_disable_count);
> -}
> -
> +extern bool lru_cache_disabled(void);
>  extern void lru_cache_disable(void);
> +extern void lru_cache_enable(void);
>  extern void lru_add_drain(void);
>  extern void lru_add_drain_cpu(int cpu);
>  extern void lru_add_drain_cpu_zone(struct zone *zone);
> diff --git a/mm/cma.c b/mm/cma.c
> index 995e15480937..60be555c5b95 100644
> --- a/mm/cma.c
> +++ b/mm/cma.c
> @@ -30,6 +30,7 @@
>  #include <linux/cma.h>
>  #include <linux/highmem.h>
>  #include <linux/io.h>
> +#include <linux/swap.h>
>  #include <linux/kmemleak.h>
>  #include <trace/events/cma.h>
>  
> @@ -453,6 +454,8 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
>  	if (bitmap_count > bitmap_maxno)
>  		goto out;
>  
> +	lru_cache_disable();
> +
>  	for (;;) {
>  		spin_lock_irq(&cma->lock);
>  		bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
> @@ -492,6 +495,8 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
>  		start = bitmap_no + mask + 1;
>  	}
>  
> +	lru_cache_enable();
> +
>  	trace_cma_alloc_finish(cma->name, pfn, page, count, align);
>  
>  	/*
> diff --git a/mm/swap.c b/mm/swap.c
> index af3cad4e5378..5f89d7c9a54e 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -847,7 +847,17 @@ void lru_add_drain_all(void)
>  }
>  #endif /* CONFIG_SMP */
>  
> -atomic_t lru_disable_count = ATOMIC_INIT(0);
> +static atomic_t lru_disable_count = ATOMIC_INIT(0);
> +
> +bool lru_cache_disabled(void)
> +{
> +	return atomic_read(&lru_disable_count) != 0;
> +}
> +
> +void lru_cache_enable(void)
> +{
> +	atomic_dec(&lru_disable_count);
> +}
>  
>  /*
>   * lru_cache_disable() needs to be called before we start compiling
> @@ -859,7 +869,21 @@ atomic_t lru_disable_count = ATOMIC_INIT(0);
>   */
>  void lru_cache_disable(void)
>  {
> -	atomic_inc(&lru_disable_count);
> +	static DEFINE_MUTEX(lock);
> +
> +	/*
> +	 * The lock gaurantees lru_cache is drained when the function
> +	 * returned.
> +	 */
> +	mutex_lock(&lock);
> +	/*
> +	 * If someone is already disabled lru_cache, just return with
> +	 * increasing the lru_disable_count.
> +	 */
> +	if (atomic_inc_not_zero(&lru_disable_count)) {
> +		mutex_unlock(&lock);
> +		return;
> +	}
>  #ifdef CONFIG_SMP
>  	/*
>  	 * lru_add_drain_all in the force mode will schedule draining on
> @@ -873,6 +897,8 @@ void lru_cache_disable(void)
>  #else
>  	lru_add_and_bh_lrus_drain();
>  #endif
> +	atomic_inc(&lru_disable_count);
> +	mutex_unlock(&lock);
>  }
>  
>  /**
> -- 
> 2.34.1.448.ga2b2bfdf31-goog
>
Michal Hocko Jan. 17, 2022, 1:47 p.m. UTC | #2
On Thu 30-12-21 11:36:27, Minchan Kim wrote:
> lru_cache_disable involves IPIs to drain pagevec of each core,
> which sometimes takes quite long time to complete depending
> on cpu's business, which makes allocation too slow up to
> sveral hundredth milliseconds. Furthermore, the repeated draining
> in the alloc_contig_range makes thing worse considering caller
> of alloc_contig_range usually tries multiple times in the loop.
>
> This patch makes the lru_cache_disable aware of the fact the
> pagevec was already disabled. With that, user of alloc_contig_range
> can disable the lru cache in advance in their context during the
> repeated trial so they can avoid the multiple costly draining
> in cma allocation.

Do you have any numbers on any improvements?

Now to the change. I do not like this much to be honest. LRU cache
disabling is a complex synchronization scheme implemented in
__lru_add_drain_all, and now you are stacking another level on top of that.

More fundamentally though, I am not sure I understand the problem TBH.
What prevents you from calling lru_cache_disable at the cma level in the
first place?

> Signed-off-by: Minchan Kim <minchan@kernel.org>
> ---
>  * from v1 - https://lore.kernel.org/lkml/20211206221006.946661-1-minchan@kernel.org/
>    * fix lru_cache_disable race - akpm
> 
>  include/linux/swap.h | 14 ++------------
>  mm/cma.c             |  5 +++++
>  mm/swap.c            | 30 ++++++++++++++++++++++++++++--
>  3 files changed, 35 insertions(+), 14 deletions(-)
> 
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index ba52f3a3478e..fe18e86a4f13 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -348,19 +348,9 @@ extern void lru_note_cost_page(struct page *);
>  extern void lru_cache_add(struct page *);
>  extern void mark_page_accessed(struct page *);
>  
> -extern atomic_t lru_disable_count;
> -
> -static inline bool lru_cache_disabled(void)
> -{
> -	return atomic_read(&lru_disable_count);
> -}
> -
> -static inline void lru_cache_enable(void)
> -{
> -	atomic_dec(&lru_disable_count);
> -}
> -
> +extern bool lru_cache_disabled(void);
>  extern void lru_cache_disable(void);
> +extern void lru_cache_enable(void);
>  extern void lru_add_drain(void);
>  extern void lru_add_drain_cpu(int cpu);
>  extern void lru_add_drain_cpu_zone(struct zone *zone);
> diff --git a/mm/cma.c b/mm/cma.c
> index 995e15480937..60be555c5b95 100644
> --- a/mm/cma.c
> +++ b/mm/cma.c
> @@ -30,6 +30,7 @@
>  #include <linux/cma.h>
>  #include <linux/highmem.h>
>  #include <linux/io.h>
> +#include <linux/swap.h>
>  #include <linux/kmemleak.h>
>  #include <trace/events/cma.h>
>  
> @@ -453,6 +454,8 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
>  	if (bitmap_count > bitmap_maxno)
>  		goto out;
>  
> +	lru_cache_disable();
> +
>  	for (;;) {
>  		spin_lock_irq(&cma->lock);
>  		bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
> @@ -492,6 +495,8 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
>  		start = bitmap_no + mask + 1;
>  	}
>  
> +	lru_cache_enable();
> +
>  	trace_cma_alloc_finish(cma->name, pfn, page, count, align);
>  
>  	/*
> diff --git a/mm/swap.c b/mm/swap.c
> index af3cad4e5378..5f89d7c9a54e 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -847,7 +847,17 @@ void lru_add_drain_all(void)
>  }
>  #endif /* CONFIG_SMP */
>  
> -atomic_t lru_disable_count = ATOMIC_INIT(0);
> +static atomic_t lru_disable_count = ATOMIC_INIT(0);
> +
> +bool lru_cache_disabled(void)
> +{
> +	return atomic_read(&lru_disable_count) != 0;
> +}
> +
> +void lru_cache_enable(void)
> +{
> +	atomic_dec(&lru_disable_count);
> +}
>  
>  /*
>   * lru_cache_disable() needs to be called before we start compiling
> @@ -859,7 +869,21 @@ atomic_t lru_disable_count = ATOMIC_INIT(0);
>   */
>  void lru_cache_disable(void)
>  {
> -	atomic_inc(&lru_disable_count);
> +	static DEFINE_MUTEX(lock);
> +
> +	/*
> +	 * The lock gaurantees lru_cache is drained when the function
> +	 * returned.
> +	 */
> +	mutex_lock(&lock);
> +	/*
> +	 * If someone is already disabled lru_cache, just return with
> +	 * increasing the lru_disable_count.
> +	 */
> +	if (atomic_inc_not_zero(&lru_disable_count)) {
> +		mutex_unlock(&lock);
> +		return;
> +	}
>  #ifdef CONFIG_SMP
>  	/*
>  	 * lru_add_drain_all in the force mode will schedule draining on
> @@ -873,6 +897,8 @@ void lru_cache_disable(void)
>  #else
>  	lru_add_and_bh_lrus_drain();
>  #endif
> +	atomic_inc(&lru_disable_count);
> +	mutex_unlock(&lock);
>  }
>  
>  /**
> -- 
> 2.34.1.448.ga2b2bfdf31-goog
Minchan Kim Jan. 19, 2022, 12:12 a.m. UTC | #3
On Mon, Jan 17, 2022 at 02:47:06PM +0100, Michal Hocko wrote:
> On Thu 30-12-21 11:36:27, Minchan Kim wrote:
> > lru_cache_disable involves IPIs to drain pagevec of each core,
> > which sometimes takes quite long time to complete depending
> > on cpu's business, which makes allocation too slow up to
> > sveral hundredth milliseconds. Furthermore, the repeated draining
> > in the alloc_contig_range makes thing worse considering caller
> > of alloc_contig_range usually tries multiple times in the loop.
> >
> > This patch makes the lru_cache_disable aware of the fact the
> > pagevec was already disabled. With that, user of alloc_contig_range
> > can disable the lru cache in advance in their context during the
> > repeated trial so they can avoid the multiple costly draining
> > in cma allocation.
> 
> Do you have any numbers on any improvements?

The LRU draining accounted for over 50% of the overhead of a 20M CMA allocation.

> 
> Now to the change. I do not like this much to be honest. LRU cache
> disabling is a complex synchronization scheme implemented in
> __lru_add_drain_all now you are stacking another level on top of that.
> 
> More fundamentally though. I am not sure I understand the problem TBH.

The problem is that this kind of IPI, issued via a normal-priority
workqueue to do the draining, can take a long time depending on how
busy the system's CPUs are.

> What prevents you from calling lru_cache_disable at the cma level in the
> first place?

You mean moving the call from alloc_contig_range to the caller layer?
So, virtio_mem_fake_online, too? It could, and that makes sense from a
performance perspective since the upper layer usually calls
alloc_contig_range multiple times in a retry loop.

Having said that, semantically it is not great, because the upper layer
should not need to know how alloc_contig_range works internally (LRU
disabling is rather low-level stuff), but I chose performance here.

Here is an example of why the stacking is needed.
cma_alloc can also be called from outside.
A use case is:

    lru_cache_disable()
    for (order = 10; order >= 0; order--) {
        page = cma_alloc(1 << order)
        if (page)
            break;
    }
    lru_cache_enable()

Here, disabling the LRU cache outside of cma_alloc is much better
than doing it inside. That's why I put it outside.
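
(For illustration, a compilable version of that pattern might look like the
sketch below. The cma pointer and the align/no_warn arguments passed to
cma_alloc are placeholders, not taken from this patch. With this patch
applied, the lru_cache_disable() that cma_alloc() itself performs just bumps
lru_disable_count instead of triggering another round of drain IPIs.)

    #include <linux/cma.h>
    #include <linux/swap.h>

    static struct page *cma_alloc_best_effort(struct cma *cma)
    {
    	struct page *page = NULL;
    	int order;

    	lru_cache_disable();	/* drain the pagevecs once, up front */
    	for (order = 10; order >= 0; order--) {
    		/* count = 1 << order pages, aligned to the same order */
    		page = cma_alloc(cma, 1 << order, order, true);
    		if (page)
    			break;
    	}
    	lru_cache_enable();

    	return page;
    }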

> 
> > Signed-off-by: Minchan Kim <minchan@kernel.org>
> > ---
> >  * from v1 - https://lore.kernel.org/lkml/20211206221006.946661-1-minchan@kernel.org/
> >    * fix lru_cache_disable race - akpm
> > 
> >  include/linux/swap.h | 14 ++------------
> >  mm/cma.c             |  5 +++++
> >  mm/swap.c            | 30 ++++++++++++++++++++++++++++--
> >  3 files changed, 35 insertions(+), 14 deletions(-)
> > 
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index ba52f3a3478e..fe18e86a4f13 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -348,19 +348,9 @@ extern void lru_note_cost_page(struct page *);
> >  extern void lru_cache_add(struct page *);
> >  extern void mark_page_accessed(struct page *);
> >  
> > -extern atomic_t lru_disable_count;
> > -
> > -static inline bool lru_cache_disabled(void)
> > -{
> > -	return atomic_read(&lru_disable_count);
> > -}
> > -
> > -static inline void lru_cache_enable(void)
> > -{
> > -	atomic_dec(&lru_disable_count);
> > -}
> > -
> > +extern bool lru_cache_disabled(void);
> >  extern void lru_cache_disable(void);
> > +extern void lru_cache_enable(void);
> >  extern void lru_add_drain(void);
> >  extern void lru_add_drain_cpu(int cpu);
> >  extern void lru_add_drain_cpu_zone(struct zone *zone);
> > diff --git a/mm/cma.c b/mm/cma.c
> > index 995e15480937..60be555c5b95 100644
> > --- a/mm/cma.c
> > +++ b/mm/cma.c
> > @@ -30,6 +30,7 @@
> >  #include <linux/cma.h>
> >  #include <linux/highmem.h>
> >  #include <linux/io.h>
> > +#include <linux/swap.h>
> >  #include <linux/kmemleak.h>
> >  #include <trace/events/cma.h>
> >  
> > @@ -453,6 +454,8 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
> >  	if (bitmap_count > bitmap_maxno)
> >  		goto out;
> >  
> > +	lru_cache_disable();
> > +
> >  	for (;;) {
> >  		spin_lock_irq(&cma->lock);
> >  		bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
> > @@ -492,6 +495,8 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
> >  		start = bitmap_no + mask + 1;
> >  	}
> >  
> > +	lru_cache_enable();
> > +
> >  	trace_cma_alloc_finish(cma->name, pfn, page, count, align);
> >  
> >  	/*
> > diff --git a/mm/swap.c b/mm/swap.c
> > index af3cad4e5378..5f89d7c9a54e 100644
> > --- a/mm/swap.c
> > +++ b/mm/swap.c
> > @@ -847,7 +847,17 @@ void lru_add_drain_all(void)
> >  }
> >  #endif /* CONFIG_SMP */
> >  
> > -atomic_t lru_disable_count = ATOMIC_INIT(0);
> > +static atomic_t lru_disable_count = ATOMIC_INIT(0);
> > +
> > +bool lru_cache_disabled(void)
> > +{
> > +	return atomic_read(&lru_disable_count) != 0;
> > +}
> > +
> > +void lru_cache_enable(void)
> > +{
> > +	atomic_dec(&lru_disable_count);
> > +}
> >  
> >  /*
> >   * lru_cache_disable() needs to be called before we start compiling
> > @@ -859,7 +869,21 @@ atomic_t lru_disable_count = ATOMIC_INIT(0);
> >   */
> >  void lru_cache_disable(void)
> >  {
> > -	atomic_inc(&lru_disable_count);
> > +	static DEFINE_MUTEX(lock);
> > +
> > +	/*
> > +	 * The lock gaurantees lru_cache is drained when the function
> > +	 * returned.
> > +	 */
> > +	mutex_lock(&lock);
> > +	/*
> > +	 * If someone is already disabled lru_cache, just return with
> > +	 * increasing the lru_disable_count.
> > +	 */
> > +	if (atomic_inc_not_zero(&lru_disable_count)) {
> > +		mutex_unlock(&lock);
> > +		return;
> > +	}
> >  #ifdef CONFIG_SMP
> >  	/*
> >  	 * lru_add_drain_all in the force mode will schedule draining on
> > @@ -873,6 +897,8 @@ void lru_cache_disable(void)
> >  #else
> >  	lru_add_and_bh_lrus_drain();
> >  #endif
> > +	atomic_inc(&lru_disable_count);
> > +	mutex_unlock(&lock);
> >  }
> >  
> >  /**
> > -- 
> > 2.34.1.448.ga2b2bfdf31-goog
> 
> -- 
> Michal Hocko
> SUSE Labs
>
Michal Hocko Jan. 19, 2022, 9:20 a.m. UTC | #4
On Tue 18-01-22 16:12:54, Minchan Kim wrote:
> On Mon, Jan 17, 2022 at 02:47:06PM +0100, Michal Hocko wrote:
> > On Thu 30-12-21 11:36:27, Minchan Kim wrote:
> > > lru_cache_disable involves IPIs to drain pagevec of each core,
> > > which sometimes takes quite long time to complete depending
> > > on cpu's business, which makes allocation too slow up to
> > > sveral hundredth milliseconds. Furthermore, the repeated draining
> > > in the alloc_contig_range makes thing worse considering caller
> > > of alloc_contig_range usually tries multiple times in the loop.
> > >
> > > This patch makes the lru_cache_disable aware of the fact the
> > > pagevec was already disabled. With that, user of alloc_contig_range
> > > can disable the lru cache in advance in their context during the
> > > repeated trial so they can avoid the multiple costly draining
> > > in cma allocation.
> > 
> > Do you have any numbers on any improvements?
> 
> The LRU draining consumed above 50% overhead for the 20M CMA alloc.

This doesn't say much about the improvement itself.
 
> > Now to the change. I do not like this much to be honest. LRU cache
> > disabling is a complex synchronization scheme implemented in
> > __lru_add_drain_all now you are stacking another level on top of that.
> > 
> > More fundamentally though. I am not sure I understand the problem TBH.
> 
> The problem is that kinds of IPI using normal prority workqueue to drain
> takes much time depending on the system CPU business.

How does this patch address that problem? The IPI has to happen at some
point as we need to sync up with pcp caches.

> > What prevents you from calling lru_cache_disable at the cma level in the
> > first place?
> 
> You meant moving the call from alloc_contig_range to caller layer?

Yes.

> So, virtio_mem_fake_online, too? It could and make sense from
> performance perspective since upper layer usually calls the
> alloc_contig_range multiple times on retrial loop.
> 
> Havid said, semantically, not good in that why upper layer should
> know how alloc_contig_range works(LRU disable is too low level stuff)
> internally but I chose the performance here.
> 
> There is an example why the stacking is needed.
> cma_alloc also can be called from outside.
> A usecase is try to call
> 
>     lru_cache_disable
>     for (order = 10; order >= 0; order) {
>         page = cma_alloc(1<<order)
>         if (page)
>             break;
>     }
>     lru_cacne_enable
> 
> Here, putting the disable lru outside of cma_alloc is
> much better than inside. That's why I put it outside.

What prevents you from calling lru_cache_{disable,enable} this way
with the existing implementation? AFAICS calls can be nested just fine.
Or am I missing something?
Minchan Kim Jan. 20, 2022, 4:25 a.m. UTC | #5
On Wed, Jan 19, 2022 at 10:20:22AM +0100, Michal Hocko wrote:
> On Tue 18-01-22 16:12:54, Minchan Kim wrote:
> > On Mon, Jan 17, 2022 at 02:47:06PM +0100, Michal Hocko wrote:
> > > On Thu 30-12-21 11:36:27, Minchan Kim wrote:
> > > > lru_cache_disable involves IPIs to drain pagevec of each core,
> > > > which sometimes takes quite long time to complete depending
> > > > on cpu's business, which makes allocation too slow up to
> > > > sveral hundredth milliseconds. Furthermore, the repeated draining
> > > > in the alloc_contig_range makes thing worse considering caller
> > > > of alloc_contig_range usually tries multiple times in the loop.
> > > >
> > > > This patch makes the lru_cache_disable aware of the fact the
> > > > pagevec was already disabled. With that, user of alloc_contig_range
> > > > can disable the lru cache in advance in their context during the
> > > > repeated trial so they can avoid the multiple costly draining
> > > > in cma allocation.
> > > 
> > > Do you have any numbers on any improvements?
> > 
> > The LRU draining consumed above 50% overhead for the 20M CMA alloc.
> 
> This doesn't say much about the improvement itself.

The improvement varies depending on the system state since it is
timing sensitive. Let me try to get some numbers.

>  
> > > Now to the change. I do not like this much to be honest. LRU cache
> > > disabling is a complex synchronization scheme implemented in
> > > __lru_add_drain_all now you are stacking another level on top of that.
> > > 
> > > More fundamentally though. I am not sure I understand the problem TBH.
> > 
> > The problem is that kinds of IPI using normal prority workqueue to drain
> > takes much time depending on the system CPU business.
> 
> How does this patch address that problem? The IPI has to happen at some
> point as we need to sync up with pcp caches.

True, but the goal is to minimize the IPI overhead. Please look at
alloc_contig_range and what happens when it fails with -EBUSY.
The caller usually tries again, and then alloc_contig_range does the
drain again. In our workload this keeps repeating until the allocation
succeeds, and the IPIs keep firing. Totally wasted.
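
(Roughly, the retry shape being described; the pfn/count handling is
simplified from the cma_alloc() loop shown in the hunk above and the
arguments are illustrative only:)

    for (;;) {
    	/* pick the next candidate pfn range from the CMA bitmap */
    	ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, GFP_KERNEL);
    	if (ret != -EBUSY)
    		break;
    	/*
    	 * Without this patch every retry pays for another
    	 * lru_cache_disable()/lru_cache_enable() cycle, i.e. another
    	 * full round of drain IPIs, inside alloc_contig_range().
    	 */
    }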

> 
> > > What prevents you from calling lru_cache_disable at the cma level in the
> > > first place?
> > 
> > You meant moving the call from alloc_contig_range to caller layer?
> 
> Yes.
> 
> > So, virtio_mem_fake_online, too? It could and make sense from
> > performance perspective since upper layer usually calls the
> > alloc_contig_range multiple times on retrial loop.
> > 
> > Havid said, semantically, not good in that why upper layer should
> > know how alloc_contig_range works(LRU disable is too low level stuff)
> > internally but I chose the performance here.
> > 
> > There is an example why the stacking is needed.
> > cma_alloc also can be called from outside.
> > A usecase is try to call
> > 
> >     lru_cache_disable
> >     for (order = 10; order >= 0; order) {
> >         page = cma_alloc(1<<order)
> >         if (page)
> >             break;
> >     }
> >     lru_cacne_enable
> > 
> > Here, putting the disable lru outside of cma_alloc is
> > much better than inside. That's why I put it outside.
> 
> What does prevent you from calling lru_cache_{disable,enable} this way
> with the existing implementation? AFAICS calls can be nested just fine.
> Or am I missing something?

It just adds more IPI calls since we drain the LRU cache in both the
upper layer and the lower layer. That's what I'd like to avoid with
this patch: disable the LRU cache only once for the entire
allocation path.
Michal Hocko Jan. 20, 2022, 8:24 a.m. UTC | #6
On Wed 19-01-22 20:25:54, Minchan Kim wrote:
> On Wed, Jan 19, 2022 at 10:20:22AM +0100, Michal Hocko wrote:
[...]
> > What does prevent you from calling lru_cache_{disable,enable} this way
> > with the existing implementation? AFAICS calls can be nested just fine.
> > Or am I missing something?
> 
> It just increases more IPI calls since we drain the lru cache
> both upper layer and lower layer. That's I'd like to avoid
> in this patch. Just disable lru cache one time for entire
> allocation path.

I do not follow. Once you call lru_cache_disable at the higher level
then no new pages are going to be added to the pcp caches. At the same
time existing caches are flushed so the inner lru_cache_disable will not
trigger any new IPIs.
David Hildenbrand Jan. 20, 2022, 8:42 a.m. UTC | #7
On 19.01.22 01:12, Minchan Kim wrote:
> On Mon, Jan 17, 2022 at 02:47:06PM +0100, Michal Hocko wrote:
>> On Thu 30-12-21 11:36:27, Minchan Kim wrote:
>>> lru_cache_disable involves IPIs to drain pagevec of each core,
>>> which sometimes takes quite long time to complete depending
>>> on cpu's business, which makes allocation too slow up to
>>> sveral hundredth milliseconds. Furthermore, the repeated draining
>>> in the alloc_contig_range makes thing worse considering caller
>>> of alloc_contig_range usually tries multiple times in the loop.
>>>
>>> This patch makes the lru_cache_disable aware of the fact the
>>> pagevec was already disabled. With that, user of alloc_contig_range
>>> can disable the lru cache in advance in their context during the
>>> repeated trial so they can avoid the multiple costly draining
>>> in cma allocation.
>>
>> Do you have any numbers on any improvements?
> 
> The LRU draining consumed above 50% overhead for the 20M CMA alloc.
> 
>>
>> Now to the change. I do not like this much to be honest. LRU cache
>> disabling is a complex synchronization scheme implemented in
>> __lru_add_drain_all now you are stacking another level on top of that.
>>
>> More fundamentally though. I am not sure I understand the problem TBH.
> 
> The problem is that kinds of IPI using normal prority workqueue to drain
> takes much time depending on the system CPU business.
> 
>> What prevents you from calling lru_cache_disable at the cma level in the
>> first place?
> 
> You meant moving the call from alloc_contig_range to caller layer?
> So, virtio_mem_fake_online, too? It could and make sense from
> performance perspective since upper layer usually calls the
> alloc_contig_range multiple times on retrial loop.
> 

^ I actually do have something like that on my TODO list.

The issue is that we have demanding requirements for
alloc_contig_range(), discussed in the past for CMA bulk allocations:

(1) Fast, unreliable allocations

Fail fast and let caller continue with next allocation instead of
retrying. Try to not degrade system performance.

(2) Slow, reliable allocations

Retry as hard as possible. Degrading system performance (e.g., disabling
lru) is acceptable.


virtio-mem is usually (2), although there could be some use cases where
we first want to try (1) -- unplug as much memory as we can fast -- to
then fallback to (2) -- unplug what remains.

CMA bulk allocations are (1). "Ordinary" CMA is mostly (2) I'd assume.
Minchan Kim Jan. 20, 2022, 9:07 p.m. UTC | #8
On Thu, Jan 20, 2022 at 09:24:22AM +0100, Michal Hocko wrote:
> On Wed 19-01-22 20:25:54, Minchan Kim wrote:
> > On Wed, Jan 19, 2022 at 10:20:22AM +0100, Michal Hocko wrote:
> [...]
> > > What does prevent you from calling lru_cache_{disable,enable} this way
> > > with the existing implementation? AFAICS calls can be nested just fine.
> > > Or am I missing something?
> > 
> > It just increases more IPI calls since we drain the lru cache
> > both upper layer and lower layer. That's I'd like to avoid
> > in this patch. Just disable lru cache one time for entire
> > allocation path.
> 
> I do not follow. Once you call lru_cache_disable at the higher level
> then no new pages are going to be added to the pcp caches. At the same
> time existing caches are flushed so the inner lru_cache_disable will not
> trigger any new IPIs.

lru_cache_disable calls __lru_add_drain_all with force_all_cpus set
unconditionally, so it keeps issuing the IPIs.
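
(For reference, before this patch the disable side is essentially just the
following; this is a paraphrase of the code the patch replaces, with the
!CONFIG_SMP branch omitted:)

    void lru_cache_disable(void)
    {
    	atomic_inc(&lru_disable_count);
    	/*
    	 * Force mode: schedule draining on all online CPUs whether or
    	 * not their pagevecs currently hold pages, hence the IPIs on
    	 * every call, nested or not.
    	 */
    	__lru_add_drain_all(true);
    }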
Minchan Kim Jan. 20, 2022, 9:22 p.m. UTC | #9
On Thu, Jan 20, 2022 at 09:42:23AM +0100, David Hildenbrand wrote:
> On 19.01.22 01:12, Minchan Kim wrote:
> > On Mon, Jan 17, 2022 at 02:47:06PM +0100, Michal Hocko wrote:
> >> On Thu 30-12-21 11:36:27, Minchan Kim wrote:
> >>> lru_cache_disable involves IPIs to drain pagevec of each core,
> >>> which sometimes takes quite long time to complete depending
> >>> on cpu's business, which makes allocation too slow up to
> >>> sveral hundredth milliseconds. Furthermore, the repeated draining
> >>> in the alloc_contig_range makes thing worse considering caller
> >>> of alloc_contig_range usually tries multiple times in the loop.
> >>>
> >>> This patch makes the lru_cache_disable aware of the fact the
> >>> pagevec was already disabled. With that, user of alloc_contig_range
> >>> can disable the lru cache in advance in their context during the
> >>> repeated trial so they can avoid the multiple costly draining
> >>> in cma allocation.
> >>
> >> Do you have any numbers on any improvements?
> > 
> > The LRU draining consumed above 50% overhead for the 20M CMA alloc.
> > 
> >>
> >> Now to the change. I do not like this much to be honest. LRU cache
> >> disabling is a complex synchronization scheme implemented in
> >> __lru_add_drain_all now you are stacking another level on top of that.
> >>
> >> More fundamentally though. I am not sure I understand the problem TBH.
> > 
> > The problem is that kinds of IPI using normal prority workqueue to drain
> > takes much time depending on the system CPU business.
> > 
> >> What prevents you from calling lru_cache_disable at the cma level in the
> >> first place?
> > 
> > You meant moving the call from alloc_contig_range to caller layer?
> > So, virtio_mem_fake_online, too? It could and make sense from
> > performance perspective since upper layer usually calls the
> > alloc_contig_range multiple times on retrial loop.
> > 
> 
> ^ I actually do have something like that on my TODO list.

I'm glad to hear you are also looking in that direction.
If we move some information out of alloc_contig_range, the upper layer
has a better chance of making it fast. An example:
"the failing pfn and why it fails".

> 
> The issue is that we have demanding requirements for
> alloc_contig_range(), discussed in the past for CMA bulk allocations:
> 
> (1) Fast, unreliable allocations
> 
> Fail fast and let caller continue with next allocation instead of
> retrying. Try to not degrade system performance.
> 
> (2) Slow, reliable allocations
> 
> Retry as good as possible. Degrading system performance (e.g., disabling
> lru) is acceptable.

"Fast and unreliable" provides lots of flexibilty for us to design
since it's okay to fail. For "Slow, reliable", "the slow" should be not
too much and we need to keep effort to make it faster.
Michal Hocko Jan. 21, 2022, 9:59 a.m. UTC | #10
On Thu 20-01-22 13:07:55, Minchan Kim wrote:
> On Thu, Jan 20, 2022 at 09:24:22AM +0100, Michal Hocko wrote:
> > On Wed 19-01-22 20:25:54, Minchan Kim wrote:
> > > On Wed, Jan 19, 2022 at 10:20:22AM +0100, Michal Hocko wrote:
> > [...]
> > > > What does prevent you from calling lru_cache_{disable,enable} this way
> > > > with the existing implementation? AFAICS calls can be nested just fine.
> > > > Or am I missing something?
> > > 
> > > It just increases more IPI calls since we drain the lru cache
> > > both upper layer and lower layer. That's I'd like to avoid
> > > in this patch. Just disable lru cache one time for entire
> > > allocation path.
> > 
> > I do not follow. Once you call lru_cache_disable at the higher level
> > then no new pages are going to be added to the pcp caches. At the same
> > time existing caches are flushed so the inner lru_cache_disable will not
> > trigger any new IPIs.
> 
> lru_cache_disable calls __lru_add_drain_all with force_all_cpus
> unconditionally so keep calling the IPI.

OK, this is something I have missed. Why cannot we remove the force_all
mode for lru_disable_count>0 when there are no pcp caches populated?
Minchan Kim Jan. 21, 2022, 9:56 p.m. UTC | #11
On Fri, Jan 21, 2022 at 10:59:32AM +0100, Michal Hocko wrote:
> On Thu 20-01-22 13:07:55, Minchan Kim wrote:
> > On Thu, Jan 20, 2022 at 09:24:22AM +0100, Michal Hocko wrote:
> > > On Wed 19-01-22 20:25:54, Minchan Kim wrote:
> > > > On Wed, Jan 19, 2022 at 10:20:22AM +0100, Michal Hocko wrote:
> > > [...]
> > > > > What does prevent you from calling lru_cache_{disable,enable} this way
> > > > > with the existing implementation? AFAICS calls can be nested just fine.
> > > > > Or am I missing something?
> > > > 
> > > > It just increases more IPI calls since we drain the lru cache
> > > > both upper layer and lower layer. That's I'd like to avoid
> > > > in this patch. Just disable lru cache one time for entire
> > > > allocation path.
> > > 
> > > I do not follow. Once you call lru_cache_disable at the higher level
> > > then no new pages are going to be added to the pcp caches. At the same
> > > time existing caches are flushed so the inner lru_cache_disable will not
> > > trigger any new IPIs.
> > 
> > lru_cache_disable calls __lru_add_drain_all with force_all_cpus
> > unconditionally so keep calling the IPI.
> 
> OK, this is something I have missed. Why cannot we remove the force_all
> mode for lru_disable_count>0 when there are no pcp caches populated?

We couldn't guarantee that the IPIs have finished with only an atomic counter.

CPU 0                               CPU 1
lru_cache_disable                   lru_cache_disable
  ret = atomic_inc_return
                                    
                                   ret = atomic_inc_return
  lru_add_drain_all(ret == 1);     lru_add_drain_all(ret == 1)
    IPI ongoing                    skip IPI
                                   alloc_contig_range
                                   fail
    ..
    ..

   IPI done
Michal Hocko Jan. 24, 2022, 9:57 a.m. UTC | #12
On Fri 21-01-22 13:56:31, Minchan Kim wrote:
> On Fri, Jan 21, 2022 at 10:59:32AM +0100, Michal Hocko wrote:
> > On Thu 20-01-22 13:07:55, Minchan Kim wrote:
> > > On Thu, Jan 20, 2022 at 09:24:22AM +0100, Michal Hocko wrote:
> > > > On Wed 19-01-22 20:25:54, Minchan Kim wrote:
> > > > > On Wed, Jan 19, 2022 at 10:20:22AM +0100, Michal Hocko wrote:
> > > > [...]
> > > > > > What does prevent you from calling lru_cache_{disable,enable} this way
> > > > > > with the existing implementation? AFAICS calls can be nested just fine.
> > > > > > Or am I missing something?
> > > > > 
> > > > > It just increases more IPI calls since we drain the lru cache
> > > > > both upper layer and lower layer. That's I'd like to avoid
> > > > > in this patch. Just disable lru cache one time for entire
> > > > > allocation path.
> > > > 
> > > > I do not follow. Once you call lru_cache_disable at the higher level
> > > > then no new pages are going to be added to the pcp caches. At the same
> > > > time existing caches are flushed so the inner lru_cache_disable will not
> > > > trigger any new IPIs.
> > > 
> > > lru_cache_disable calls __lru_add_drain_all with force_all_cpus
> > > unconditionally so keep calling the IPI.
> > 
> > OK, this is something I have missed. Why cannot we remove the force_all
> > mode for lru_disable_count>0 when there are no pcp caches populated?
> 
> Couldn't gaurantee whether the IPI is finished with only atomic counter.
> 
> CPU 0                               CPU 1
> lru_cache_disable                   lru_cache_disable
>   ret = atomic_inc_return
>                                     
>                                    ret = atomic_inc_return
>   lru_add_drain_all(ret == 1);     lru_add_drain_all(ret == 1)
>     IPI ongoing                    skip IPI
>                                    alloc_contig_range
>                                    fail
>     ..
>     ..
> 
>    IPI done

But __lru_add_drain_all holds a local mutex while the IPI flushing is
done, so a racing lru_cache_disable would block until
flush_work(&per_cpu(lru_add_drain_work, cpu)) completes and all IPIs are
handled. Or am I missing something?
Minchan Kim Jan. 24, 2022, 10:22 p.m. UTC | #13
On Mon, Jan 24, 2022 at 10:57:36AM +0100, Michal Hocko wrote:
> On Fri 21-01-22 13:56:31, Minchan Kim wrote:
> > On Fri, Jan 21, 2022 at 10:59:32AM +0100, Michal Hocko wrote:
> > > On Thu 20-01-22 13:07:55, Minchan Kim wrote:
> > > > On Thu, Jan 20, 2022 at 09:24:22AM +0100, Michal Hocko wrote:
> > > > > On Wed 19-01-22 20:25:54, Minchan Kim wrote:
> > > > > > On Wed, Jan 19, 2022 at 10:20:22AM +0100, Michal Hocko wrote:
> > > > > [...]
> > > > > > > What does prevent you from calling lru_cache_{disable,enable} this way
> > > > > > > with the existing implementation? AFAICS calls can be nested just fine.
> > > > > > > Or am I missing something?
> > > > > > 
> > > > > > It just increases more IPI calls since we drain the lru cache
> > > > > > both upper layer and lower layer. That's I'd like to avoid
> > > > > > in this patch. Just disable lru cache one time for entire
> > > > > > allocation path.
> > > > > 
> > > > > I do not follow. Once you call lru_cache_disable at the higher level
> > > > > then no new pages are going to be added to the pcp caches. At the same
> > > > > time existing caches are flushed so the inner lru_cache_disable will not
> > > > > trigger any new IPIs.
> > > > 
> > > > lru_cache_disable calls __lru_add_drain_all with force_all_cpus
> > > > unconditionally so keep calling the IPI.
> > > 
> > > OK, this is something I have missed. Why cannot we remove the force_all
> > > mode for lru_disable_count>0 when there are no pcp caches populated?
> > 
> > Couldn't gaurantee whether the IPI is finished with only atomic counter.
> > 
> > CPU 0                               CPU 1
> > lru_cache_disable                   lru_cache_disable
> >   ret = atomic_inc_return
> >                                     
> >                                    ret = atomic_inc_return
> >   lru_add_drain_all(ret == 1);     lru_add_drain_all(ret == 1)
> >     IPI ongoing                    skip IPI
> >                                    alloc_contig_range
> >                                    fail
> >     ..
> >     ..
> > 
> >    IPI done
> 
> But __lru_add_drain_all uses a local mutex while the IPI flushing is
> done so the racing lru_cache_disable would block until
> flush_work(&per_cpu(lru_add_drain_work, cpu)) completes so all IPIs are
> handled. Or am I missing something?

 CPU 0                               CPU 1

 lru_cache_disable                  lru_cache_disable
   ret = atomic_inc_return;(ret = 1)
                                     
                                    ret = atomic_inc_return;(ret = 2)
                                    
   lru_add_drain_all(true);         
                                    lru_add_drain_all(false)
                                    mutex_lock() is holding
   mutex_lock() is waiting

                                    IPI with !force_all_cpus
                                    ...
                                    ...
                                    IPI done but it skipped some CPUs
               
     ..
     ..
 

Thus, lru_cache_disable on CPU 1 doesn't run on every CPU, which
introduces a race on lru_disable_count: cores that did not run the
IPI could still accept incoming pages into their per-cpu caches.
Michal Hocko Jan. 25, 2022, 9:23 a.m. UTC | #14
On Mon 24-01-22 14:22:03, Minchan Kim wrote:
[...]
>  CPU 0                               CPU 1
> 
>  lru_cache_disable                  lru_cache_disable
>    ret = atomic_inc_return;(ret = 1)
>                                      
>                                     ret = atomic_inc_return;(ret = 2)
>                                     
>    lru_add_drain_all(true);         
>                                     lru_add_drain_all(false)
>                                     mutex_lock() is holding
>    mutex_lock() is waiting
> 
>                                     IPI with !force_all_cpus
>                                     ...
>                                     ...
>                                     IPI done but it skipped some CPUs
>                
>      ..
>      ..
>  
> 
> Thus, lru_cache_disable on CPU 1 doesn't run with every CPUs so it
> introduces race of lru_disable_count so some pages on cores
> which didn't run the IPI could accept upcoming pages into per-cpu
> cache.

Yes, that is certainly possible but the question is whether it really
matters all that much. The race would also require another racer to be
adding a page to an _empty_ pcp list at the same time.

pagevec_add_and_need_flush
  1) pagevec_add # add to pcp list
  2) lru_cache_disabled
    atomic_read(lru_disable_count) = 0
  # no flush but the page is on pcp

There is no strong memory ordering between 1 and 2 and that is why we
need an IPI to enforce it in general IIRC.
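
(The check in question lives in the pagevec fill path; roughly, and
simplified from mm/swap.c of this era, so treat it as a sketch rather than
the exact code:)

    static bool pagevec_add_and_need_flush(struct pagevec *pvec,
    				       struct page *page)
    {
    	bool ret = false;

    	/* 1) the page goes onto this CPU's pagevec ... */
    	if (!pagevec_add(pvec, page) || PageCompound(page) ||
    			lru_cache_disabled())	/* 2) ... then the disable flag is read */
    		ret = true;

    	return ret;
    }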

But lru_cache_disable is not a strong synchronization primitive. It aims
at providing a best-effort means to reduce false positives, right? IMHO
it doesn't make much sense to aim for perfection because all users of
this interface already have to live with temporary failures, and pcp
caches are not the only reason to fail - e.g. short-lived page pins.

That being said, I would rather live with a best effort and simpler
implementation approach rather than aim for perfection in this case.
The scheme is already quite complex and another lock in the mix doesn't
make it any easier to follow. If others believe that another lock makes
the implementation more straightforward I will not object but I would go
with the following.

diff --git a/mm/swap.c b/mm/swap.c
index ae8d56848602..c140c3743b9e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -922,7 +922,8 @@ atomic_t lru_disable_count = ATOMIC_INIT(0);
  */
 void lru_cache_disable(void)
 {
-	atomic_inc(&lru_disable_count);
+	int count = atomic_inc_return(&lru_disable_count);
+
 #ifdef CONFIG_SMP
 	/*
 	 * lru_add_drain_all in the force mode will schedule draining on
@@ -931,8 +932,28 @@ void lru_cache_disable(void)
 	 * The atomic operation doesn't need to have stronger ordering
 	 * requirements because that is enforeced by the scheduling
 	 * guarantees.
+	 * Please note that there is a potential for a race condition:
+	 * CPU0				CPU1			CPU2
+	 * pagevec_add_and_need_flush
+	 *   pagevec_add # to the empty list
+	 *   lru_cache_disabled
+	 *     atomic_read # 0
+	 *   				lru_cache_disable	lru_cache_disable
+	 *				  atomic_inc_return (1)
+	 *				  			  atomic_inc_return (2)
+	 *				  __lru_add_drain_all(true)
+	 *				  			  __lru_add_drain_all(false)
+	 *				  			    mutex_lock
+	 *				    mutex_lock
+	 *				    			    # skip cpu0 (pagevec_add not visible yet)
+	 *				    			    mutex_unlock
+	 *				    			 # fail because of pcp(0) pin
+	 *				    queue_work_on(0)
+	 *
+	 * but the scheme is a best effort and the above race quite unlikely
+	 * to matter in real life.
 	 */
-	__lru_add_drain_all(true);
+	__lru_add_drain_all(count == 1);
 #else
 	lru_add_and_bh_lrus_drain();
 #endif
Minchan Kim Jan. 25, 2022, 9:06 p.m. UTC | #15
On Tue, Jan 25, 2022 at 10:23:13AM +0100, Michal Hocko wrote:
> On Mon 24-01-22 14:22:03, Minchan Kim wrote:
> [...]
> >  CPU 0                               CPU 1
> > 
> >  lru_cache_disable                  lru_cache_disable
> >    ret = atomic_inc_return;(ret = 1)
> >                                      
> >                                     ret = atomic_inc_return;(ret = 2)
> >                                     
> >    lru_add_drain_all(true);         
> >                                     lru_add_drain_all(false)
> >                                     mutex_lock() is holding
> >    mutex_lock() is waiting
> > 
> >                                     IPI with !force_all_cpus
> >                                     ...
> >                                     ...
> >                                     IPI done but it skipped some CPUs
> >                
> >      ..
> >      ..
> >  
> > 
> > Thus, lru_cache_disable on CPU 1 doesn't run with every CPUs so it
> > introduces race of lru_disable_count so some pages on cores
> > which didn't run the IPI could accept upcoming pages into per-cpu
> > cache.
> 
> Yes, that is certainly possible but the question is whether it really
> matters all that much. The race would require also another racer to be
> adding a page to an _empty_ pcp list at the same time.
> 
> pagevec_add_and_need_flush
>   1) pagevec_add # add to pcp list
>   2) lru_cache_disabled
>     atomic_read(lru_disable_count) = 0
>   # no flush but the page is on pcp
> 
> There is no strong memory ordering between 1 and 2 and that is why we
> need an IPI to enforce it in general IIRC.

Correct.

> 
> But lru_cache_disable is not a strong synchronization primitive. It aims
> at providing a best effort means to reduce false positives, right? IMHO

Nope. d479960e44f27, mm: disable LRU pagevec during the migration temporarily

Originally, it was designed to close the race fundamentally.

> it doesn't make much sense to aim for perfection because all users of
> this interface already have to live with temporary failures and pcp
> caches is not the only reason to fail - e.g. short lived page pins.

Short-lived page pins do exist, but that doesn't mean we don't need to
make the allocation faster. As I mentioned, the IPIs can easily take up
to hundreds of milliseconds once the CPUs are fully booked. If we reduce
that cost, the time can be spent on more productive work. I am working
on making CMA more deterministic and this patch is one part of that.

> 
> That being said, I would rather live with a best effort and simpler
> implementation approach rather than aim for perfection in this case.
> The scheme is already quite complex and another lock in the mix doesn't

lru_add_drain_all already hides the whole complexity inside, and
lru_cache_disable adds a simple synchronization to keep ordering on top
of it. That's a natural SW stack and I don't see too much complication
here.

> make it any easier to follow. If others believe that another lock makes

Disagree. lru_cache_disable is designed to guarantee that the race you
are opening again stays closed, so the rest of the allocator code does
not need to consider the race at all while the per-cpu cache is
disabled. It's simpler and more deterministic, and we could build other
things on top of it (e.g., zone->pcp).

> the implementation more straightforward I will not object but I would go
> with the following.
> 
> diff --git a/mm/swap.c b/mm/swap.c
> index ae8d56848602..c140c3743b9e 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -922,7 +922,8 @@ atomic_t lru_disable_count = ATOMIC_INIT(0);
>   */
>  void lru_cache_disable(void)
>  {
> -	atomic_inc(&lru_disable_count);
> +	int count = atomic_inc_return(&lru_disable_count);
> +
>  #ifdef CONFIG_SMP
>  	/*
>  	 * lru_add_drain_all in the force mode will schedule draining on
> @@ -931,8 +932,28 @@ void lru_cache_disable(void)
>  	 * The atomic operation doesn't need to have stronger ordering
>  	 * requirements because that is enforeced by the scheduling
>  	 * guarantees.
> +	 * Please note that there is a potential for a race condition:
> +	 * CPU0				CPU1			CPU2
> +	 * pagevec_add_and_need_flush
> +	 *   pagevec_add # to the empty list
> +	 *   lru_cache_disabled
> +	 *     atomic_read # 0
> +	 *   				lru_cache_disable	lru_cache_disable
> +	 *				  atomic_inc_return (1)
> +	 *				  			  atomic_inc_return (2)
> +	 *				  __lru_add_drain_all(true)
> +	 *				  			  __lru_add_drain_all(false)
> +	 *				  			    mutex_lock
> +	 *				    mutex_lock
> +	 *				    			    # skip cpu0 (pagevec_add not visible yet)
> +	 *				    			    mutex_unlock
> +	 *				    			 # fail because of pcp(0) pin
> +	 *				    queue_work_on(0)
> +	 *
> +	 * but the scheme is a best effort and the above race quite unlikely
> +	 * to matter in real life.
>  	 */
> -	__lru_add_drain_all(true);
> +	__lru_add_drain_all(count == 1);
>  #else
>  	lru_add_and_bh_lrus_drain();
>  #endif
> -- 
> Michal Hocko
> SUSE Labs
Michal Hocko Jan. 26, 2022, 12:09 p.m. UTC | #16
On Tue 25-01-22 13:06:17, Minchan Kim wrote:
> On Tue, Jan 25, 2022 at 10:23:13AM +0100, Michal Hocko wrote:
> > On Mon 24-01-22 14:22:03, Minchan Kim wrote:
> > [...]
> > >  CPU 0                               CPU 1
> > > 
> > >  lru_cache_disable                  lru_cache_disable
> > >    ret = atomic_inc_return;(ret = 1)
> > >                                      
> > >                                     ret = atomic_inc_return;(ret = 2)
> > >                                     
> > >    lru_add_drain_all(true);         
> > >                                     lru_add_drain_all(false)
> > >                                     mutex_lock() is holding
> > >    mutex_lock() is waiting
> > > 
> > >                                     IPI with !force_all_cpus
> > >                                     ...
> > >                                     ...
> > >                                     IPI done but it skipped some CPUs
> > >                
> > >      ..
> > >      ..
> > >  
> > > 
> > > Thus, lru_cache_disable on CPU 1 doesn't run with every CPUs so it
> > > introduces race of lru_disable_count so some pages on cores
> > > which didn't run the IPI could accept upcoming pages into per-cpu
> > > cache.
> > 
> > Yes, that is certainly possible but the question is whether it really
> > matters all that much. The race would require also another racer to be
> > adding a page to an _empty_ pcp list at the same time.
> > 
> > pagevec_add_and_need_flush
> >   1) pagevec_add # add to pcp list
> >   2) lru_cache_disabled
> >     atomic_read(lru_disable_count) = 0
> >   # no flush but the page is on pcp
> > 
> > There is no strong memory ordering between 1 and 2 and that is why we
> > need an IPI to enforce it in general IIRC.
> 
> Correct.
> 
> > 
> > But lru_cache_disable is not a strong synchronization primitive. It aims
> > at providing a best effort means to reduce false positives, right? IMHO
> 
> Nope. d479960e44f27, mm: disable LRU pagevec during the migration temporarily
> 
> Originally, it was designed to close the race fundamentally.

yes, but that turned out to be expensive due to additional IPIs.

> > it doesn't make much sense to aim for perfection because all users of
> > this interface already have to live with temporary failures and pcp
> > caches is not the only reason to fail - e.g. short lived page pins.
> 
> short lived pages are true but that doesn't mean we need to make the
> allocation faster. As I mentioned, the IPI takes up to hundreds
> milliseconds easily once CPUs are fully booked. If we reduce the
> cost, we could spend the time more productive works. I am working
> on making CMA more determinstic and this patch is one of parts.
> 
> > 
> > That being said, I would rather live with a best effort and simpler
> > implementation approach rather than aim for perfection in this case.
> > The scheme is already quite complex and another lock in the mix doesn't
> 
> lru_add_drain_all already hides the whole complexity inside and
> lru_cache_disable adds A simple synchroniztion to keep ordering
> on top of it. That's natural SW stack and I don't see too complication
> here.
> 
> > make it any easier to follow. If others believe that another lock makes
> 
> Disagree. lru_cache_disable is designed to guarantee closing the race
> you are opening again so the other code in allocator since disabling
> per-cpu cache doesn't need to consider the race at all. It's more
> simple/deterministic and we could make other stuff based on it(e.g.,
> zone->pcp). 

Let me repeat. The race is theoretically possible but I strongly doubt
it happens often enough to warrant more complexity in the scheme. Flushing
on the first lru_cache_disable sounds like a reasonable optimization to
the existing scheme without adding a ton of complexity. Adding another
lock to the picture sounds like over-engineering the already complex
scheme.

If you strongly disagree with that then I guess we have to agree to
disagree. I will not nak your patch, but please consider the practical
aspects of this whole scheme. It really doesn't make much sense to make
one part of the potential failure mode absolutely bulletproof when the
more likely temporary page pins are still going to be present.

Patch

diff --git a/include/linux/swap.h b/include/linux/swap.h
index ba52f3a3478e..fe18e86a4f13 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -348,19 +348,9 @@  extern void lru_note_cost_page(struct page *);
 extern void lru_cache_add(struct page *);
 extern void mark_page_accessed(struct page *);
 
-extern atomic_t lru_disable_count;
-
-static inline bool lru_cache_disabled(void)
-{
-	return atomic_read(&lru_disable_count);
-}
-
-static inline void lru_cache_enable(void)
-{
-	atomic_dec(&lru_disable_count);
-}
-
+extern bool lru_cache_disabled(void);
 extern void lru_cache_disable(void);
+extern void lru_cache_enable(void);
 extern void lru_add_drain(void);
 extern void lru_add_drain_cpu(int cpu);
 extern void lru_add_drain_cpu_zone(struct zone *zone);
diff --git a/mm/cma.c b/mm/cma.c
index 995e15480937..60be555c5b95 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -30,6 +30,7 @@ 
 #include <linux/cma.h>
 #include <linux/highmem.h>
 #include <linux/io.h>
+#include <linux/swap.h>
 #include <linux/kmemleak.h>
 #include <trace/events/cma.h>
 
@@ -453,6 +454,8 @@  struct page *cma_alloc(struct cma *cma, unsigned long count,
 	if (bitmap_count > bitmap_maxno)
 		goto out;
 
+	lru_cache_disable();
+
 	for (;;) {
 		spin_lock_irq(&cma->lock);
 		bitmap_no = bitmap_find_next_zero_area_off(cma->bitmap,
@@ -492,6 +495,8 @@  struct page *cma_alloc(struct cma *cma, unsigned long count,
 		start = bitmap_no + mask + 1;
 	}
 
+	lru_cache_enable();
+
 	trace_cma_alloc_finish(cma->name, pfn, page, count, align);
 
 	/*
diff --git a/mm/swap.c b/mm/swap.c
index af3cad4e5378..5f89d7c9a54e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -847,7 +847,17 @@  void lru_add_drain_all(void)
 }
 #endif /* CONFIG_SMP */
 
-atomic_t lru_disable_count = ATOMIC_INIT(0);
+static atomic_t lru_disable_count = ATOMIC_INIT(0);
+
+bool lru_cache_disabled(void)
+{
+	return atomic_read(&lru_disable_count) != 0;
+}
+
+void lru_cache_enable(void)
+{
+	atomic_dec(&lru_disable_count);
+}
 
 /*
  * lru_cache_disable() needs to be called before we start compiling
@@ -859,7 +869,21 @@  atomic_t lru_disable_count = ATOMIC_INIT(0);
  */
 void lru_cache_disable(void)
 {
-	atomic_inc(&lru_disable_count);
+	static DEFINE_MUTEX(lock);
+
+	/*
+	 * The lock guarantees the LRU cache is drained when the function
+	 * returns.
+	 */
+	mutex_lock(&lock);
+	/*
+	 * If someone has already disabled the LRU cache, just return after
+	 * incrementing lru_disable_count.
+	 */
+	if (atomic_inc_not_zero(&lru_disable_count)) {
+		mutex_unlock(&lock);
+		return;
+	}
 #ifdef CONFIG_SMP
 	/*
 	 * lru_add_drain_all in the force mode will schedule draining on
@@ -873,6 +897,8 @@  void lru_cache_disable(void)
 #else
 	lru_add_and_bh_lrus_drain();
 #endif
+	atomic_inc(&lru_disable_count);
+	mutex_unlock(&lock);
 }
 
 /**