
[v3,2/4] mm/vmap: preload a CPU with one object for split purpose

Message ID 20190527093842.10701-3-urezki@gmail.com (mailing list archive)
State New, archived
Series Some cleanups for the KVA/vmalloc

Commit Message

Uladzislau Rezki May 27, 2019, 9:38 a.m. UTC
Refactor the NE_FIT_TYPE split case when it comes to allocating
one extra object. We need it in order to build the remaining
space.

Introduce ne_fit_preload()/ne_fit_preload_end() functions
for preloading one extra vmap_area object to ensure that
we have it available when fit type is NE_FIT_TYPE.

The preload is done per CPU in non-atomic context, thus with the
GFP_KERNEL allocation mask. More permissive parameters can be
beneficial for systems which suffer from high memory pressure or
low memory conditions.

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
---
 mm/vmalloc.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 76 insertions(+), 3 deletions(-)

Comments

Roman Gushchin May 28, 2019, 10:42 p.m. UTC | #1
On Mon, May 27, 2019 at 11:38:40AM +0200, Uladzislau Rezki (Sony) wrote:
> Refactor the NE_FIT_TYPE split case when it comes to an
> allocation of one extra object. We need it in order to
> build a remaining space.
> 
> Introduce ne_fit_preload()/ne_fit_preload_end() functions
> for preloading one extra vmap_area object to ensure that
> we have it available when fit type is NE_FIT_TYPE.
> 
> The preload is done per CPU in non-atomic context thus with
> GFP_KERNEL allocation masks. More permissive parameters can
> be beneficial for systems which are suffer from high memory
> pressure or low memory condition.
> 
> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> ---
>  mm/vmalloc.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 76 insertions(+), 3 deletions(-)

Hi Uladzislau!

This patch generally looks good to me (see some nits below),
but it would be really great to add some motivation, e.g. numbers.

Thanks!

> 
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index ea1b65fac599..b553047aa05b 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -364,6 +364,13 @@ static LIST_HEAD(free_vmap_area_list);
>   */
>  static struct rb_root free_vmap_area_root = RB_ROOT;
>  
> +/*
> + * Preload a CPU with one object for "no edge" split case. The
> + * aim is to get rid of allocations from the atomic context, thus
> + * to use more permissive allocation masks.
> + */
> +static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
> +
>  static __always_inline unsigned long
>  va_size(struct vmap_area *va)
>  {
> @@ -950,9 +957,24 @@ adjust_va_to_fit_type(struct vmap_area *va,
>  		 *   L V  NVA  V R
>  		 * |---|-------|---|
>  		 */
> -		lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
> -		if (unlikely(!lva))
> -			return -1;
> +		lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
> +		if (unlikely(!lva)) {
> +			/*
> +			 * For percpu allocator we do not do any pre-allocation
> +			 * and leave it as it is. The reason is it most likely
> +			 * never ends up with NE_FIT_TYPE splitting. In case of
> +			 * percpu allocations offsets and sizes are aligned to
> +			 * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
> +			 * are its main fitting cases.
> +			 *
> +			 * There are a few exceptions though, as an example it is
> +			 * a first allocation (early boot up) when we have "one"
> +			 * big free space that has to be split.
> +			 */
> +			lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
> +			if (!lva)
> +				return -1;
> +		}
>  
>  		/*
>  		 * Build the remainder.
> @@ -1023,6 +1045,48 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
>  }
>  
>  /*
> + * Preload this CPU with one extra vmap_area object to ensure
> + * that we have it available when fit type of free area is
> + * NE_FIT_TYPE.
> + *
> + * The preload is done in non-atomic context, thus it allows us
> + * to use more permissive allocation masks to be more stable under
> + * low memory condition and high memory pressure.
> + *
> + * If success it returns 1 with preemption disabled. In case
> + * of error 0 is returned with preemption not disabled. Note it
> + * has to be paired with ne_fit_preload_end().
> + */
> +static int

Cosmetic nit: you don't need a new line here.

> +ne_fit_preload(int nid)

> +{
> +	preempt_disable();
> +
> +	if (!__this_cpu_read(ne_fit_preload_node)) {
> +		struct vmap_area *node;
> +
> +		preempt_enable();
> +		node = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, nid);
> +		if (node == NULL)
> +			return 0;
> +
> +		preempt_disable();
> +
> +		if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, node))
> +			kmem_cache_free(vmap_area_cachep, node);
> +	}
> +
> +	return 1;
> +}
> +
> +static void

Here too.

> +ne_fit_preload_end(int preloaded)
> +{
> +	if (preloaded)
> +		preempt_enable();
> +}

I'd open code it. It's used only once, but hiding preempt_disable()
behind a helper makes it harder to understand and easier to mess up.

Then ne_fit_preload() might require disabled preemption (which it can
temporarily re-enable), so that preempt_enable()/disable() logic
will be in one place.
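
I.e. something along these lines (a rough, untested sketch; it reuses
the helpers and the per-CPU variable from the patch, only the
preempt_disable()/preempt_enable() pair is owned by the caller):

<snip>
/*
 * Requires preemption to be disabled by the caller; it is re-enabled
 * only around the sleeping GFP_KERNEL allocation.
 */
static void
ne_fit_preload(int nid)
{
	struct vmap_area *pva;

	if (__this_cpu_read(ne_fit_preload_node))
		return;

	preempt_enable();
	pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, nid);
	preempt_disable();

	/* We may have migrated to another CPU or raced with a preload. */
	if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva) && pva)
		kmem_cache_free(vmap_area_cachep, pva);
}
<snip>

With pva == NULL the cmpxchg is a harmless no-op, so an allocation
failure needs no extra handling here.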

> +
> +/*
>   * Allocate a region of KVA of the specified size and alignment, within the
>   * vstart and vend.
>   */
> @@ -1034,6 +1098,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
>  	struct vmap_area *va;
>  	unsigned long addr;
>  	int purged = 0;
> +	int preloaded;
>  
>  	BUG_ON(!size);
>  	BUG_ON(offset_in_page(size));
> @@ -1056,6 +1121,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
>  	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
>  
>  retry:
> +	/*
> +	 * Even if it fails we do not really care about that.
> +	 * Just proceed as it is. "overflow" path will refill
> +	 * the cache we allocate from.
> +	 */
> +	preloaded = ne_fit_preload(node);
>  	spin_lock(&vmap_area_lock);
>  
>  	/*
> @@ -1063,6 +1134,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
>  	 * returned. Therefore trigger the overflow path.
>  	 */
>  	addr = __alloc_vmap_area(size, align, vstart, vend);
> +	ne_fit_preload_end(preloaded);
> +
>  	if (unlikely(addr == vend))
>  		goto overflow;
>  
> -- 
> 2.11.0
>
Uladzislau Rezki May 29, 2019, 2:27 p.m. UTC | #2
Hello, Roman!

> On Mon, May 27, 2019 at 11:38:40AM +0200, Uladzislau Rezki (Sony) wrote:
> > Refactor the NE_FIT_TYPE split case when it comes to an
> > allocation of one extra object. We need it in order to
> > build a remaining space.
> > 
> > Introduce ne_fit_preload()/ne_fit_preload_end() functions
> > for preloading one extra vmap_area object to ensure that
> > we have it available when fit type is NE_FIT_TYPE.
> > 
> > The preload is done per CPU in non-atomic context thus with
> > GFP_KERNEL allocation masks. More permissive parameters can
> > be beneficial for systems which are suffer from high memory
> > pressure or low memory condition.
> > 
> > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > ---
> >  mm/vmalloc.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
> >  1 file changed, 76 insertions(+), 3 deletions(-)
> 
> Hi Uladzislau!
> 
> This patch generally looks good to me (see some nits below),
> but it would be really great to add some motivation, e.g. numbers.
> 
The main goal of this patch is to get rid of using GFP_NOWAIT, since
it is more restricted due to allocation from atomic context. IMHO, if
we can avoid using it, that is the right way to go.

On the other hand, as I mentioned before, I have not seen any issues
with that on any of my test systems during the big rework. But it
could be beneficial for tiny systems which do not have any swap and
are limited in memory size.
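
By "more restricted" I mean that GFP_NOWAIT cannot enter direct
reclaim, whereas GFP_KERNEL can sleep, reclaim and do IO/FS.
Roughly, from include/linux/gfp.h:

<snip>
#define GFP_NOWAIT	(__GFP_KSWAPD_RECLAIM)
#define GFP_KERNEL	(__GFP_RECLAIM | __GFP_IO | __GFP_FS)
<snip>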

> > 
> > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > index ea1b65fac599..b553047aa05b 100644
> > --- a/mm/vmalloc.c
> > +++ b/mm/vmalloc.c
> > @@ -364,6 +364,13 @@ static LIST_HEAD(free_vmap_area_list);
> >   */
> >  static struct rb_root free_vmap_area_root = RB_ROOT;
> >  
> > +/*
> > + * Preload a CPU with one object for "no edge" split case. The
> > + * aim is to get rid of allocations from the atomic context, thus
> > + * to use more permissive allocation masks.
> > + */
> > +static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
> > +
> >  static __always_inline unsigned long
> >  va_size(struct vmap_area *va)
> >  {
> > @@ -950,9 +957,24 @@ adjust_va_to_fit_type(struct vmap_area *va,
> >  		 *   L V  NVA  V R
> >  		 * |---|-------|---|
> >  		 */
> > -		lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
> > -		if (unlikely(!lva))
> > -			return -1;
> > +		lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
> > +		if (unlikely(!lva)) {
> > +			/*
> > +			 * For percpu allocator we do not do any pre-allocation
> > +			 * and leave it as it is. The reason is it most likely
> > +			 * never ends up with NE_FIT_TYPE splitting. In case of
> > +			 * percpu allocations offsets and sizes are aligned to
> > +			 * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
> > +			 * are its main fitting cases.
> > +			 *
> > +			 * There are a few exceptions though, as an example it is
> > +			 * a first allocation (early boot up) when we have "one"
> > +			 * big free space that has to be split.
> > +			 */
> > +			lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
> > +			if (!lva)
> > +				return -1;
> > +		}
> >  
> >  		/*
> >  		 * Build the remainder.
> > @@ -1023,6 +1045,48 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
> >  }
> >  
> >  /*
> > + * Preload this CPU with one extra vmap_area object to ensure
> > + * that we have it available when fit type of free area is
> > + * NE_FIT_TYPE.
> > + *
> > + * The preload is done in non-atomic context, thus it allows us
> > + * to use more permissive allocation masks to be more stable under
> > + * low memory condition and high memory pressure.
> > + *
> > + * If success it returns 1 with preemption disabled. In case
> > + * of error 0 is returned with preemption not disabled. Note it
> > + * has to be paired with ne_fit_preload_end().
> > + */
> > +static int
> 
> Cosmetic nit: you don't need a new line here.
> 
> > +ne_fit_preload(int nid)
> 
I can fix that.

> > +{
> > +	preempt_disable();
> > +
> > +	if (!__this_cpu_read(ne_fit_preload_node)) {
> > +		struct vmap_area *node;
> > +
> > +		preempt_enable();
> > +		node = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, nid);
> > +		if (node == NULL)
> > +			return 0;
> > +
> > +		preempt_disable();
> > +
> > +		if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, node))
> > +			kmem_cache_free(vmap_area_cachep, node);
> > +	}
> > +
> > +	return 1;
> > +}
> > +
> > +static void
> 
> Here too.
> 
> > +ne_fit_preload_end(int preloaded)
> > +{
> > +	if (preloaded)
> > +		preempt_enable();
> > +}
I can fix that.

> 
> I'd open code it. It's used only once, but hiding preempt_disable()
> behind a helper makes it harder to understand and easier to mess.
> 
> Then ne_fit_preload() might require disabled preemption (which it can
> temporarily re-enable), so that preempt_enable()/disable() logic
> will be in one place.
> 
I see your point. One of the aims was to make the alloc_vmap_area()
function less cluttered. But we can refactor it like you say:

<snip>
 static void
@@ -1091,7 +1089,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
                                unsigned long vstart, unsigned long vend,
                                int node, gfp_t gfp_mask)
 {
-       struct vmap_area *va;
+       struct vmap_area *va, *pva;
        unsigned long addr;
        int purged = 0;
        int preloaded;
@@ -1122,16 +1120,26 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
         * Just proceed as it is. "overflow" path will refill
         * the cache we allocate from.
         */
-       ne_fit_preload(&preloaded);
+       preempt_disable();
+       if (!__this_cpu_read(ne_fit_preload_node)) {
+               preempt_enable();
+               pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node);
+               preempt_disable();
+
+               if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) {
+                       if (pva)
+                               kmem_cache_free(vmap_area_cachep, pva);
+               }
+       }
+
        spin_lock(&vmap_area_lock);
+       preempt_enable();
 
        /*
         * If an allocation fails, the "vend" address is
         * returned. Therefore trigger the overflow path.
         */
        addr = __alloc_vmap_area(size, align, vstart, vend);
-       ne_fit_preload_end(preloaded);
-
        if (unlikely(addr == vend))
                goto overflow;
<snip>

Do you mean something like that? If so, I can go with that, unless
there are any objections from others.

Thank you for your comments, Roman!

--
Vlad Rezki
Roman Gushchin May 29, 2019, 4:34 p.m. UTC | #3
On Wed, May 29, 2019 at 04:27:15PM +0200, Uladzislau Rezki wrote:
> Hello, Roman!
> 
> > On Mon, May 27, 2019 at 11:38:40AM +0200, Uladzislau Rezki (Sony) wrote:
> > > Refactor the NE_FIT_TYPE split case when it comes to an
> > > allocation of one extra object. We need it in order to
> > > build a remaining space.
> > > 
> > > Introduce ne_fit_preload()/ne_fit_preload_end() functions
> > > for preloading one extra vmap_area object to ensure that
> > > we have it available when fit type is NE_FIT_TYPE.
> > > 
> > > The preload is done per CPU in non-atomic context thus with
> > > GFP_KERNEL allocation masks. More permissive parameters can
> > > be beneficial for systems which are suffer from high memory
> > > pressure or low memory condition.
> > > 
> > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > ---
> > >  mm/vmalloc.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
> > >  1 file changed, 76 insertions(+), 3 deletions(-)
> > 
> > Hi Uladzislau!
> > 
> > This patch generally looks good to me (see some nits below),
> > but it would be really great to add some motivation, e.g. numbers.
> > 
> The main goal of this patch to get rid of using GFP_NOWAIT since it is
> more restricted due to allocation from atomic context. IMHO, if we can
> avoid of using it that is a right way to go.
> 
> From the other hand, as i mentioned before i have not seen any issues
> with that on all my test systems during big rework. But it could be
> beneficial for tiny systems where we do not have any swap and are
> limited in memory size.

Ok, that makes sense to me. Is it possible to emulate such a tiny system
on kvm and measure the benefits? Again, not a strong opinion here,
but it will be easier to justify adding a good chunk of code.

> 
> > > 
> > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > > index ea1b65fac599..b553047aa05b 100644
> > > --- a/mm/vmalloc.c
> > > +++ b/mm/vmalloc.c
> > > @@ -364,6 +364,13 @@ static LIST_HEAD(free_vmap_area_list);
> > >   */
> > >  static struct rb_root free_vmap_area_root = RB_ROOT;
> > >  
> > > +/*
> > > + * Preload a CPU with one object for "no edge" split case. The
> > > + * aim is to get rid of allocations from the atomic context, thus
> > > + * to use more permissive allocation masks.
> > > + */
> > > +static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
> > > +
> > >  static __always_inline unsigned long
> > >  va_size(struct vmap_area *va)
> > >  {
> > > @@ -950,9 +957,24 @@ adjust_va_to_fit_type(struct vmap_area *va,
> > >  		 *   L V  NVA  V R
> > >  		 * |---|-------|---|
> > >  		 */
> > > -		lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
> > > -		if (unlikely(!lva))
> > > -			return -1;
> > > +		lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
> > > +		if (unlikely(!lva)) {
> > > +			/*
> > > +			 * For percpu allocator we do not do any pre-allocation
> > > +			 * and leave it as it is. The reason is it most likely
> > > +			 * never ends up with NE_FIT_TYPE splitting. In case of
> > > +			 * percpu allocations offsets and sizes are aligned to
> > > +			 * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
> > > +			 * are its main fitting cases.
> > > +			 *
> > > +			 * There are a few exceptions though, as an example it is
> > > +			 * a first allocation (early boot up) when we have "one"
> > > +			 * big free space that has to be split.
> > > +			 */
> > > +			lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
> > > +			if (!lva)
> > > +				return -1;
> > > +		}
> > >  
> > >  		/*
> > >  		 * Build the remainder.
> > > @@ -1023,6 +1045,48 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
> > >  }
> > >  
> > >  /*
> > > + * Preload this CPU with one extra vmap_area object to ensure
> > > + * that we have it available when fit type of free area is
> > > + * NE_FIT_TYPE.
> > > + *
> > > + * The preload is done in non-atomic context, thus it allows us
> > > + * to use more permissive allocation masks to be more stable under
> > > + * low memory condition and high memory pressure.
> > > + *
> > > + * If success it returns 1 with preemption disabled. In case
> > > + * of error 0 is returned with preemption not disabled. Note it
> > > + * has to be paired with ne_fit_preload_end().
> > > + */
> > > +static int
> > 
> > Cosmetic nit: you don't need a new line here.
> > 
> > > +ne_fit_preload(int nid)
> > 
> I can fix that.
> 
> > > +{
> > > +	preempt_disable();
> > > +
> > > +	if (!__this_cpu_read(ne_fit_preload_node)) {
> > > +		struct vmap_area *node;
> > > +
> > > +		preempt_enable();
> > > +		node = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, nid);
> > > +		if (node == NULL)
> > > +			return 0;
> > > +
> > > +		preempt_disable();
> > > +
> > > +		if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, node))
> > > +			kmem_cache_free(vmap_area_cachep, node);
> > > +	}
> > > +
> > > +	return 1;
> > > +}
> > > +
> > > +static void
> > 
> > Here too.
> > 
> > > +ne_fit_preload_end(int preloaded)
> > > +{
> > > +	if (preloaded)
> > > +		preempt_enable();
> > > +}
> I can fix that.
> 
> > 
> > I'd open code it. It's used only once, but hiding preempt_disable()
> > behind a helper makes it harder to understand and easier to mess.
> > 
> > Then ne_fit_preload() might require disabled preemption (which it can
> > temporarily re-enable), so that preempt_enable()/disable() logic
> > will be in one place.
> > 
> I see your point. One of the aim was to make less clogged the
> alloc_vmap_area() function. But we can refactor it like you say:
> 
> <snip>
>  static void
> @@ -1091,7 +1089,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
>                                 unsigned long vstart, unsigned long vend,
>                                 int node, gfp_t gfp_mask)
>  {
> -       struct vmap_area *va;
> +       struct vmap_area *va, *pva;
>         unsigned long addr;
>         int purged = 0;
>         int preloaded;
> @@ -1122,16 +1120,26 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
>          * Just proceed as it is. "overflow" path will refill
>          * the cache we allocate from.
>          */
> -       ne_fit_preload(&preloaded);
> +       preempt_disable();
> +       if (!__this_cpu_read(ne_fit_preload_node)) {
> +               preempt_enable();
> +               pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node);
> +               preempt_disable();
> +
> +               if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) {
> +                       if (pva)
> +                               kmem_cache_free(vmap_area_cachep, pva);
> +               }
> +       }
> +
>         spin_lock(&vmap_area_lock);
> +       preempt_enable();
>  
>         /*
>          * If an allocation fails, the "vend" address is
>          * returned. Therefore trigger the overflow path.
>          */
>         addr = __alloc_vmap_area(size, align, vstart, vend);
> -       ne_fit_preload_end(preloaded);
> -
>         if (unlikely(addr == vend))
>                 goto overflow;
> <snip>
> 
> Do you mean something like that? If so, i can go with that, unless there are no
> any objections from others.

Yes, it looks much better to me!

Thank you!
Uladzislau Rezki June 3, 2019, 5:53 p.m. UTC | #4
Hello, Roman!

On Wed, May 29, 2019 at 04:34:40PM +0000, Roman Gushchin wrote:
> On Wed, May 29, 2019 at 04:27:15PM +0200, Uladzislau Rezki wrote:
> > Hello, Roman!
> > 
> > > On Mon, May 27, 2019 at 11:38:40AM +0200, Uladzislau Rezki (Sony) wrote:
> > > > Refactor the NE_FIT_TYPE split case when it comes to an
> > > > allocation of one extra object. We need it in order to
> > > > build a remaining space.
> > > > 
> > > > Introduce ne_fit_preload()/ne_fit_preload_end() functions
> > > > for preloading one extra vmap_area object to ensure that
> > > > we have it available when fit type is NE_FIT_TYPE.
> > > > 
> > > > The preload is done per CPU in non-atomic context thus with
> > > > GFP_KERNEL allocation masks. More permissive parameters can
> > > > be beneficial for systems which are suffer from high memory
> > > > pressure or low memory condition.
> > > > 
> > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > > ---
> > > >  mm/vmalloc.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
> > > >  1 file changed, 76 insertions(+), 3 deletions(-)
> > > 
> > > Hi Uladzislau!
> > > 
> > > This patch generally looks good to me (see some nits below),
> > > but it would be really great to add some motivation, e.g. numbers.
> > > 
> > The main goal of this patch to get rid of using GFP_NOWAIT since it is
> > more restricted due to allocation from atomic context. IMHO, if we can
> > avoid of using it that is a right way to go.
> > 
> > From the other hand, as i mentioned before i have not seen any issues
> > with that on all my test systems during big rework. But it could be
> > beneficial for tiny systems where we do not have any swap and are
> > limited in memory size.
> 
> Ok, that makes sense to me. Is it possible to emulate such a tiny system
> on kvm and measure the benefits? Again, not a strong opinion here,
> but it will be easier to justify adding a good chunk of code.
> 
It seems it is not as straightforward as it looks. I tried it before,
but usually the system panics due to running out of memory or just
invokes the OOM killer.

I will upload a new version of it, where I embed the "preloading"
logic directly into the alloc_vmap_area() function.

Thanks.

--
Vlad Rezki
Uladzislau Rezki June 3, 2019, 8:53 p.m. UTC | #5
On Mon, Jun 03, 2019 at 07:53:12PM +0200, Uladzislau Rezki wrote:
> Hello, Roman!
> 
> On Wed, May 29, 2019 at 04:34:40PM +0000, Roman Gushchin wrote:
> > On Wed, May 29, 2019 at 04:27:15PM +0200, Uladzislau Rezki wrote:
> > > Hello, Roman!
> > > 
> > > > On Mon, May 27, 2019 at 11:38:40AM +0200, Uladzislau Rezki (Sony) wrote:
> > > > > Refactor the NE_FIT_TYPE split case when it comes to an
> > > > > allocation of one extra object. We need it in order to
> > > > > build a remaining space.
> > > > > 
> > > > > Introduce ne_fit_preload()/ne_fit_preload_end() functions
> > > > > for preloading one extra vmap_area object to ensure that
> > > > > we have it available when fit type is NE_FIT_TYPE.
> > > > > 
> > > > > The preload is done per CPU in non-atomic context thus with
> > > > > GFP_KERNEL allocation masks. More permissive parameters can
> > > > > be beneficial for systems which are suffer from high memory
> > > > > pressure or low memory condition.
> > > > > 
> > > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > > > ---
> > > > >  mm/vmalloc.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
> > > > >  1 file changed, 76 insertions(+), 3 deletions(-)
> > > > 
> > > > Hi Uladzislau!
> > > > 
> > > > This patch generally looks good to me (see some nits below),
> > > > but it would be really great to add some motivation, e.g. numbers.
> > > > 
> > > The main goal of this patch to get rid of using GFP_NOWAIT since it is
> > > more restricted due to allocation from atomic context. IMHO, if we can
> > > avoid of using it that is a right way to go.
> > > 
> > > From the other hand, as i mentioned before i have not seen any issues
> > > with that on all my test systems during big rework. But it could be
> > > beneficial for tiny systems where we do not have any swap and are
> > > limited in memory size.
> > 
> > Ok, that makes sense to me. Is it possible to emulate such a tiny system
> > on kvm and measure the benefits? Again, not a strong opinion here,
> > but it will be easier to justify adding a good chunk of code.
> > 
> It seems it is not so straightforward as it looks like. I tried it before,
> but usually the systems gets panic due to out of memory or just invokes
> the OOM killer.
> 
> I will upload a new version of it, where i embed "preloading" logic directly
> into alloc_vmap_area() function.
> 
I just managed to simulate the faulty behavior of the GFP_NOWAIT
restriction, resulting in a failure of the vmalloc allocation. Under
heavy load and low memory conditions, and without swap, I can trigger
the warning below on my KVM machine:

<snip>
[  366.910037] Out of memory: Killed process 470 (bash) total-vm:21012kB, anon-rss:1700kB, file-rss:264kB, shmem-rss:0kB
[  366.910692] oom_reaper: reaped process 470 (bash), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
[  367.913199] stress-ng-fork: page allocation failure: order:0, mode:0x40800(GFP_NOWAIT|__GFP_COMP), nodemask=(null),cpuset=/,mems_allowed=0
[  367.913206] CPU: 3 PID: 19951 Comm: stress-ng-fork Not tainted 5.2.0-rc3+ #999
[  367.913207] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
[  367.913208] Call Trace:
[  367.913215]  dump_stack+0x5c/0x7b
[  367.913219]  warn_alloc+0x108/0x190
[  367.913222]  __alloc_pages_slowpath+0xdc7/0xdf0
[  367.913226]  __alloc_pages_nodemask+0x2de/0x330
[  367.913230]  cache_grow_begin+0x77/0x420
[  367.913232]  fallback_alloc+0x161/0x200
[  367.913235]  kmem_cache_alloc+0x1c9/0x570
[  367.913237]  alloc_vmap_area+0x98b/0xa20
[  367.913240]  __get_vm_area_node+0xb0/0x170
[  367.913243]  __vmalloc_node_range+0x6d/0x230
[  367.913246]  ? _do_fork+0xce/0x3d0
[  367.913248]  copy_process.part.46+0x850/0x1b90
[  367.913250]  ? _do_fork+0xce/0x3d0
[  367.913254]  _do_fork+0xce/0x3d0
[  367.913257]  ? __do_page_fault+0x2bf/0x4e0
[  367.913260]  do_syscall_64+0x55/0x130
[  367.913263]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[  367.913265] RIP: 0033:0x7f2a8248d38b
[  367.913268] Code: db 45 85 f6 0f 85 95 01 00 00 64 4c 8b 04 25 10 00 00 00 31 d2 4d 8d 90 d0 02 00 00 31 f6 bf 11 00 20 01 b8 38 00 00 00 0f 05 <48> 3d 00 f0 ff ff 0f 87 de 00 00 00 85 c0 41 89 c5 0f 85 e5 00 00
[  367.913269] RSP: 002b:00007fff1b058c30 EFLAGS: 00000246 ORIG_RAX: 0000000000000038
[  367.913271] RAX: ffffffffffffffda RBX: 00007fff1b058c30 RCX: 00007f2a8248d38b
[  367.913272] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000001200011
[  367.913273] RBP: 00007fff1b058c80 R08: 00007f2a83d34300 R09: 00007fff1b1890a0
[  367.913274] R10: 00007f2a83d345d0 R11: 0000000000000246 R12: 0000000000000000
[  367.913275] R13: 0000000000000020 R14: 0000000000000000 R15: 0000000000000000
[  367.913278] Mem-Info:
[  367.913282] active_anon:45795 inactive_anon:80706 isolated_anon:0
                active_file:394 inactive_file:359 isolated_file:210
                unevictable:2 dirty:0 writeback:0 unstable:0
                slab_reclaimable:2691 slab_unreclaimable:21864
                mapped:80835 shmem:80740 pagetables:50422 bounce:0
                free:12185 free_pcp:776 free_cma:0
[  367.913286] Node 0 active_anon:183180kB inactive_anon:322824kB active_file:1576kB inactive_file:1436kB unevictable:8kB isolated(anon):0kB isolated(file):840kB mapped:323340kB dirty:0kB writeback:0kB shmem:322960kB shmem_thp: 0kB shmem_pmdmapped: 0kB anon_thp: 0kB writeback_tmp:0kB unstable:0kB all_unreclaimable? no
[  367.913287] Node 0 DMA free:4516kB min:724kB low:904kB high:1084kB active_anon:2384kB inactive_anon:0kB active_file:48kB inactive_file:0kB unevictable:0kB writepending:0kB present:15992kB managed:15908kB mlocked:0kB kernel_stack:1256kB pagetables:4516kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB
[  367.913292] lowmem_reserve[]: 0 948 948 948
[  367.913294] Node 0 DMA32 free:44224kB min:44328kB low:55408kB high:66488kB active_anon:180252kB inactive_anon:322824kB active_file:992kB inactive_file:1332kB unevictable:8kB writepending:252kB present:1032064kB managed:995428kB mlocked:8kB kernel_stack:43260kB pagetables:197172kB bounce:0kB free_pcp:3252kB local_pcp:480kB free_cma:0kB
[  367.913299] lowmem_reserve[]: 0 0 0 0
[  367.913301] Node 0 DMA: 46*4kB (UM) 45*8kB (UM) 12*16kB (UM) 9*32kB (UM) 2*64kB (M) 2*128kB (UM) 2*256kB (M) 3*512kB (M) 1*1024kB (M) 0*2048kB 0*4096kB = 4480kB
[  367.913310] Node 0 DMA32: 966*4kB (UE) 552*8kB (UME) 648*16kB (UME) 265*32kB (UME) 75*64kB (UME) 12*128kB (ME) 1*256kB (U) 1*512kB (E) 1*1024kB (U) 2*2048kB (UM) 1*4096kB (M) = 43448kB
[  367.913322] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
[  367.913323] 81750 total pagecache pages
[  367.913324] 0 pages in swap cache
[  367.913325] Swap cache stats: add 0, delete 0, find 0/0
[  367.913325] Free swap  = 0kB
[  367.913326] Total swap = 0kB
[  367.913327] 262014 pages RAM
[  367.913327] 0 pages HighMem/MovableOnly
[  367.913328] 9180 pages reserved
[  367.913329] 0 pages hwpoisoned
[  372.338733] systemd-journald[195]: /dev/kmsg buffer overrun, some messages lost.
<snip>

Whereas with "preload" logic i see only OOM killer related messages:

<snip>
[  136.787266] oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),cpuset=/,mems_allowed=0,global_oom,task_memcg=/,task=systemd-journal,pid=196,uid=0
[  136.787276] Out of memory: Killed process 196 (systemd-journal) total-vm:56832kB, anon-rss:512kB, file-rss:336kB, shmem-rss:820kB
[  136.790481] oom_reaper: reaped process 196 (systemd-journal), now anon-rss:0kB, file-rss:0kB, shmem-rss:820kB
<snip>

i.e. vmalloc is still able to allocate.

Probably I need to update the commit message with this simulation and these findings.

--
Vlad Rezki
Roman Gushchin June 3, 2019, 9:06 p.m. UTC | #6
On Mon, Jun 03, 2019 at 10:53:34PM +0200, Uladzislau Rezki wrote:
> On Mon, Jun 03, 2019 at 07:53:12PM +0200, Uladzislau Rezki wrote:
> > Hello, Roman!
> > 
> > On Wed, May 29, 2019 at 04:34:40PM +0000, Roman Gushchin wrote:
> > > On Wed, May 29, 2019 at 04:27:15PM +0200, Uladzislau Rezki wrote:
> > > > Hello, Roman!
> > > > 
> > > > > On Mon, May 27, 2019 at 11:38:40AM +0200, Uladzislau Rezki (Sony) wrote:
> > > > > > Refactor the NE_FIT_TYPE split case when it comes to an
> > > > > > allocation of one extra object. We need it in order to
> > > > > > build a remaining space.
> > > > > > 
> > > > > > Introduce ne_fit_preload()/ne_fit_preload_end() functions
> > > > > > for preloading one extra vmap_area object to ensure that
> > > > > > we have it available when fit type is NE_FIT_TYPE.
> > > > > > 
> > > > > > The preload is done per CPU in non-atomic context thus with
> > > > > > GFP_KERNEL allocation masks. More permissive parameters can
> > > > > > be beneficial for systems which are suffer from high memory
> > > > > > pressure or low memory condition.
> > > > > > 
> > > > > > Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> > > > > > ---
> > > > > >  mm/vmalloc.c | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
> > > > > >  1 file changed, 76 insertions(+), 3 deletions(-)
> > > > > 
> > > > > Hi Uladzislau!
> > > > > 
> > > > > This patch generally looks good to me (see some nits below),
> > > > > but it would be really great to add some motivation, e.g. numbers.
> > > > > 
> > > > The main goal of this patch to get rid of using GFP_NOWAIT since it is
> > > > more restricted due to allocation from atomic context. IMHO, if we can
> > > > avoid of using it that is a right way to go.
> > > > 
> > > > From the other hand, as i mentioned before i have not seen any issues
> > > > with that on all my test systems during big rework. But it could be
> > > > beneficial for tiny systems where we do not have any swap and are
> > > > limited in memory size.
> > > 
> > > Ok, that makes sense to me. Is it possible to emulate such a tiny system
> > > on kvm and measure the benefits? Again, not a strong opinion here,
> > > but it will be easier to justify adding a good chunk of code.
> > > 
> > It seems it is not so straightforward as it looks like. I tried it before,
> > but usually the systems gets panic due to out of memory or just invokes
> > the OOM killer.
> > 
> > I will upload a new version of it, where i embed "preloading" logic directly
> > into alloc_vmap_area() function.
> > 
> just managed to simulate the faulty behavior of GFP_NOWAIT restriction,
> resulting to failure of vmalloc allocation. Under heavy load and low
> memory condition and without swap, i can trigger below warning on my
> KVM machine:
> 
> <snip>
> [  366.910037] Out of memory: Killed process 470 (bash) total-vm:21012kB, anon-rss:1700kB, file-rss:264kB, shmem-rss:0kB
> [  366.910692] oom_reaper: reaped process 470 (bash), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB
> [  367.913199] stress-ng-fork: page allocation failure: order:0, mode:0x40800(GFP_NOWAIT|__GFP_COMP), nodemask=(null),cpuset=/,mems_allowed=0
> [  367.913206] CPU: 3 PID: 19951 Comm: stress-ng-fork Not tainted 5.2.0-rc3+ #999
> [  367.913207] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014
> [  367.913208] Call Trace:
> [  367.913215]  dump_stack+0x5c/0x7b
> [  367.913219]  warn_alloc+0x108/0x190
> [  367.913222]  __alloc_pages_slowpath+0xdc7/0xdf0
> [  367.913226]  __alloc_pages_nodemask+0x2de/0x330
> [  367.913230]  cache_grow_begin+0x77/0x420
> [  367.913232]  fallback_alloc+0x161/0x200
> [  367.913235]  kmem_cache_alloc+0x1c9/0x570
> [  367.913237]  alloc_vmap_area+0x98b/0xa20
> [  367.913240]  __get_vm_area_node+0xb0/0x170
> [  367.913243]  __vmalloc_node_range+0x6d/0x230
> [  367.913246]  ? _do_fork+0xce/0x3d0
> [  367.913248]  copy_process.part.46+0x850/0x1b90
> [  367.913250]  ? _do_fork+0xce/0x3d0
> [  367.913254]  _do_fork+0xce/0x3d0
> [  367.913257]  ? __do_page_fault+0x2bf/0x4e0
> [  367.913260]  do_syscall_64+0x55/0x130
> [  367.913263]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
> [  367.913265] RIP: 0033:0x7f2a8248d38b
> [  367.913268] Code: db 45 85 f6 0f 85 95 01 00 00 64 4c 8b 04 25 10 00 00 00 31 d2 4d 8d 90 d0 02 00 00 31 f6 bf 11 00 20 01 b8 38 00 00 00 0f 05 <48> 3d 00 f0 ff ff 0f 87 de 00 00 00 85 c0 41 89 c5 0f 85 e5 00 00
> [  367.913269] RSP: 002b:00007fff1b058c30 EFLAGS: 00000246 ORIG_RAX: 0000000000000038
> [  367.913271] RAX: ffffffffffffffda RBX: 00007fff1b058c30 RCX: 00007f2a8248d38b
> [  367.913272] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000001200011
> [  367.913273] RBP: 00007fff1b058c80 R08: 00007f2a83d34300 R09: 00007fff1b1890a0
> [  367.913274] R10: 00007f2a83d345d0 R11: 0000000000000246 R12: 0000000000000000
> [  367.913275] R13: 0000000000000020 R14: 0000000000000000 R15: 0000000000000000
> [  367.913278] Mem-Info:
> [  367.913282] active_anon:45795 inactive_anon:80706 isolated_anon:0
>                 active_file:394 inactive_file:359 isolated_file:210
>                 unevictable:2 dirty:0 writeback:0 unstable:0
>                 slab_reclaimable:2691 slab_unreclaimable:21864
>                 mapped:80835 shmem:80740 pagetables:50422 bounce:0
>                 free:12185 free_pcp:776 free_cma:0
> [  367.913286] Node 0 active_anon:183180kB inactive_anon:322824kB active_file:1576kB inactive_file:1436kB unevictable:8kB isolated(anon):0kB isolated(file):840kB mapped:323340kB dirty:0kB writeback:0kB shmem:322960kB shmem_thp: 0kB shmem_pmdmapped: 0kB anon_thp: 0kB writeback_tmp:0kB unstable:0kB all_unreclaimable? no
> [  367.913287] Node 0 DMA free:4516kB min:724kB low:904kB high:1084kB active_anon:2384kB inactive_anon:0kB active_file:48kB inactive_file:0kB unevictable:0kB writepending:0kB present:15992kB managed:15908kB mlocked:0kB kernel_stack:1256kB pagetables:4516kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB
> [  367.913292] lowmem_reserve[]: 0 948 948 948
> [  367.913294] Node 0 DMA32 free:44224kB min:44328kB low:55408kB high:66488kB active_anon:180252kB inactive_anon:322824kB active_file:992kB inactive_file:1332kB unevictable:8kB writepending:252kB present:1032064kB managed:995428kB mlocked:8kB kernel_stack:43260kB pagetables:197172kB bounce:0kB free_pcp:3252kB local_pcp:480kB free_cma:0kB
> [  367.913299] lowmem_reserve[]: 0 0 0 0
> [  367.913301] Node 0 DMA: 46*4kB (UM) 45*8kB (UM) 12*16kB (UM) 9*32kB (UM) 2*64kB (M) 2*128kB (UM) 2*256kB (M) 3*512kB (M) 1*1024kB (M) 0*2048kB 0*4096kB = 4480kB
> [  367.913310] Node 0 DMA32: 966*4kB (UE) 552*8kB (UME) 648*16kB (UME) 265*32kB (UME) 75*64kB (UME) 12*128kB (ME) 1*256kB (U) 1*512kB (E) 1*1024kB (U) 2*2048kB (UM) 1*4096kB (M) = 43448kB
> [  367.913322] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
> [  367.913323] 81750 total pagecache pages
> [  367.913324] 0 pages in swap cache
> [  367.913325] Swap cache stats: add 0, delete 0, find 0/0
> [  367.913325] Free swap  = 0kB
> [  367.913326] Total swap = 0kB
> [  367.913327] 262014 pages RAM
> [  367.913327] 0 pages HighMem/MovableOnly
> [  367.913328] 9180 pages reserved
> [  367.913329] 0 pages hwpoisoned
> [  372.338733] systemd-journald[195]: /dev/kmsg buffer overrun, some messages lost.
> <snip>
> 
> Whereas with "preload" logic i see only OOM killer related messages:
> 
> <snip>
> [  136.787266] oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),cpuset=/,mems_allowed=0,global_oom,task_memcg=/,task=systemd-journal,pid=196,uid=0
> [  136.787276] Out of memory: Killed process 196 (systemd-journal) total-vm:56832kB, anon-rss:512kB, file-rss:336kB, shmem-rss:820kB
> [  136.790481] oom_reaper: reaped process 196 (systemd-journal), now anon-rss:0kB, file-rss:0kB, shmem-rss:820kB
> <snip>
> 
> i.e. vmalloc still able to allocate.
> 
> Probably i need to update the commit message by this simulation and finding.

Ah, perfect! Then it makes total sense to me.

Thanks!

Patch

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ea1b65fac599..b553047aa05b 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -364,6 +364,13 @@  static LIST_HEAD(free_vmap_area_list);
  */
 static struct rb_root free_vmap_area_root = RB_ROOT;
 
+/*
+ * Preload a CPU with one object for "no edge" split case. The
+ * aim is to get rid of allocations from the atomic context, thus
+ * to use more permissive allocation masks.
+ */
+static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
+
 static __always_inline unsigned long
 va_size(struct vmap_area *va)
 {
@@ -950,9 +957,24 @@  adjust_va_to_fit_type(struct vmap_area *va,
 		 *   L V  NVA  V R
 		 * |---|-------|---|
 		 */
-		lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
-		if (unlikely(!lva))
-			return -1;
+		lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
+		if (unlikely(!lva)) {
+			/*
+			 * For percpu allocator we do not do any pre-allocation
+			 * and leave it as it is. The reason is it most likely
+			 * never ends up with NE_FIT_TYPE splitting. In case of
+			 * percpu allocations offsets and sizes are aligned to
+			 * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
+			 * are its main fitting cases.
+			 *
+			 * There are a few exceptions though, as an example it is
+			 * a first allocation (early boot up) when we have "one"
+			 * big free space that has to be split.
+			 */
+			lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
+			if (!lva)
+				return -1;
+		}
 
 		/*
 		 * Build the remainder.
@@ -1023,6 +1045,48 @@  __alloc_vmap_area(unsigned long size, unsigned long align,
 }
 
 /*
+ * Preload this CPU with one extra vmap_area object to ensure
+ * that we have it available when fit type of free area is
+ * NE_FIT_TYPE.
+ *
+ * The preload is done in non-atomic context, thus it allows us
+ * to use more permissive allocation masks to be more stable under
+ * low memory condition and high memory pressure.
+ *
+ * If success it returns 1 with preemption disabled. In case
+ * of error 0 is returned with preemption not disabled. Note it
+ * has to be paired with ne_fit_preload_end().
+ */
+static int
+ne_fit_preload(int nid)
+{
+	preempt_disable();
+
+	if (!__this_cpu_read(ne_fit_preload_node)) {
+		struct vmap_area *node;
+
+		preempt_enable();
+		node = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, nid);
+		if (node == NULL)
+			return 0;
+
+		preempt_disable();
+
+		if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, node))
+			kmem_cache_free(vmap_area_cachep, node);
+	}
+
+	return 1;
+}
+
+static void
+ne_fit_preload_end(int preloaded)
+{
+	if (preloaded)
+		preempt_enable();
+}
+
+/*
  * Allocate a region of KVA of the specified size and alignment, within the
  * vstart and vend.
  */
@@ -1034,6 +1098,7 @@  static struct vmap_area *alloc_vmap_area(unsigned long size,
 	struct vmap_area *va;
 	unsigned long addr;
 	int purged = 0;
+	int preloaded;
 
 	BUG_ON(!size);
 	BUG_ON(offset_in_page(size));
@@ -1056,6 +1121,12 @@  static struct vmap_area *alloc_vmap_area(unsigned long size,
 	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
 
 retry:
+	/*
+	 * Even if it fails we do not really care about that.
+	 * Just proceed as it is. "overflow" path will refill
+	 * the cache we allocate from.
+	 */
+	preloaded = ne_fit_preload(node);
 	spin_lock(&vmap_area_lock);
 
 	/*
@@ -1063,6 +1134,8 @@  static struct vmap_area *alloc_vmap_area(unsigned long size,
 	 * returned. Therefore trigger the overflow path.
 	 */
 	addr = __alloc_vmap_area(size, align, vstart, vend);
+	ne_fit_preload_end(preloaded);
+
 	if (unlikely(addr == vend))
 		goto overflow;