diff mbox

[RFC,2/2] mm, mempool: do not throttle PF_LESS_THROTTLE tasks

Message ID 1468831285-27242-2-git-send-email-mhocko@kernel.org (mailing list archive)
State Not Applicable, archived
Delegated to: Mike Snitzer
Headers show

Commit Message

Michal Hocko July 18, 2016, 8:41 a.m. UTC
From: Michal Hocko <mhocko@suse.com>

Mikulas has reported that a swap backed by dm-crypt doesn't work
properly because the swapout cannot make a sufficient forward progress
as the writeout path depends on dm_crypt worker which has to allocate
memory to perform the encryption. In order to guarantee a forward
progress it relies on the mempool allocator. mempool_alloc(), however,
prefers to use the underlying (usually page) allocator before it grabs
objects from the pool. Such an allocation can dive into the memory
reclaim and consequently to throttle_vm_writeout. If there are too many
dirty pages or pages under writeback it will get throttled even though
it is in fact a flusher trying to clean pending pages.

[  345.352536] kworker/u4:0    D ffff88003df7f438 10488     6      2 0x00000000
[  345.352536] Workqueue: kcryptd kcryptd_crypt [dm_crypt]
[  345.352536]  ffff88003df7f438 ffff88003e5d0380 ffff88003e5d0380 ffff88003e5d8e80
[  345.352536]  ffff88003dfb3240 ffff88003df73240 ffff88003df80000 ffff88003df7f470
[  345.352536]  ffff88003e5d0380 ffff88003e5d0380 ffff88003df7f828 ffff88003df7f450
[  345.352536] Call Trace:
[  345.352536]  [<ffffffff818d466c>] schedule+0x3c/0x90
[  345.352536]  [<ffffffff818d96a8>] schedule_timeout+0x1d8/0x360
[  345.352536]  [<ffffffff81135e40>] ? detach_if_pending+0x1c0/0x1c0
[  345.352536]  [<ffffffff811407c3>] ? ktime_get+0xb3/0x150
[  345.352536]  [<ffffffff811958cf>] ? __delayacct_blkio_start+0x1f/0x30
[  345.352536]  [<ffffffff818d39e4>] io_schedule_timeout+0xa4/0x110
[  345.352536]  [<ffffffff8121d886>] congestion_wait+0x86/0x1f0
[  345.352536]  [<ffffffff810fdf40>] ? prepare_to_wait_event+0xf0/0xf0
[  345.352536]  [<ffffffff812061d4>] throttle_vm_writeout+0x44/0xd0
[  345.352536]  [<ffffffff81211533>] shrink_zone_memcg+0x613/0x720
[  345.352536]  [<ffffffff81211720>] shrink_zone+0xe0/0x300
[  345.352536]  [<ffffffff81211aed>] do_try_to_free_pages+0x1ad/0x450
[  345.352536]  [<ffffffff81211e7f>] try_to_free_pages+0xef/0x300
[  345.352536]  [<ffffffff811fef19>] __alloc_pages_nodemask+0x879/0x1210
[  345.352536]  [<ffffffff810e8080>] ? sched_clock_cpu+0x90/0xc0
[  345.352536]  [<ffffffff8125a8d1>] alloc_pages_current+0xa1/0x1f0
[  345.352536]  [<ffffffff81265ef5>] ? new_slab+0x3f5/0x6a0
[  345.352536]  [<ffffffff81265dd7>] new_slab+0x2d7/0x6a0
[  345.352536]  [<ffffffff810e7f87>] ? sched_clock_local+0x17/0x80
[  345.352536]  [<ffffffff812678cb>] ___slab_alloc+0x3fb/0x5c0
[  345.352536]  [<ffffffff811f71bd>] ? mempool_alloc_slab+0x1d/0x30
[  345.352536]  [<ffffffff810e7f87>] ? sched_clock_local+0x17/0x80
[  345.352536]  [<ffffffff811f71bd>] ? mempool_alloc_slab+0x1d/0x30
[  345.352536]  [<ffffffff81267ae1>] __slab_alloc+0x51/0x90
[  345.352536]  [<ffffffff811f71bd>] ? mempool_alloc_slab+0x1d/0x30
[  345.352536]  [<ffffffff81267d9b>] kmem_cache_alloc+0x27b/0x310
[  345.352536]  [<ffffffff811f71bd>] mempool_alloc_slab+0x1d/0x30
[  345.352536]  [<ffffffff811f6f11>] mempool_alloc+0x91/0x230
[  345.352536]  [<ffffffff8141a02d>] bio_alloc_bioset+0xbd/0x260
[  345.352536]  [<ffffffffc02f1a54>] kcryptd_crypt+0x114/0x3b0 [dm_crypt]

Memory pools are usually used for the writeback paths and it doesn't
really make much sense to throttle them just because there are too many
dirty/writeback pages. The main purpose of throttle_vm_writeout is to
make sure that the pageout path doesn't generate too much dirty data.
Considering that we are in the mempool path, which performs __GFP_NORETRY
requests, the risk shouldn't really be high.

Fix this by ensuring that mempool users will get PF_LESS_THROTTLE and
that such processes are not throttled in throttle_vm_writeout. They can
still get throttled due to current_may_throttle() sleeps but that should
happen when the backing device itself is congested which sounds like a
proper reaction.

Please note that the bonus given by domain_dirty_limits() alone is not
sufficient because at least dm-crypt has to double buffer each page
under writeback, so that bonus alone won't prevent the task from being
throttled.

There are other users of the flag but they are in the writeout path so
this looks like a proper thing for them as well.

Reported-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Michal Hocko <mhocko@suse.com>
---
 mm/mempool.c        | 19 +++++++++++++++----
 mm/page-writeback.c |  3 +++
 2 files changed, 18 insertions(+), 4 deletions(-)

Comments

Mikulas Patocka July 19, 2016, 9:50 p.m. UTC | #1
On Mon, 18 Jul 2016, Michal Hocko wrote:

> From: Michal Hocko <mhocko@suse.com>
> 
> Mikulas has reported that a swap backed by dm-crypt doesn't work
> properly because the swapout cannot make a sufficient forward progress
> as the writeout path depends on dm_crypt worker which has to allocate
> memory to perform the encryption. In order to guarantee a forward
> progress it relies on the mempool allocator. mempool_alloc(), however,
> prefers to use the underlying (usually page) allocator before it grabs
> objects from the pool. Such an allocation can dive into the memory
> reclaim and consequently to throttle_vm_writeout. If there are too many
> dirty or pages under writeback it will get throttled even though it is
> in fact a flusher to clear pending pages.
> 
> [  345.352536] kworker/u4:0    D ffff88003df7f438 10488     6      2 0x00000000
> [  345.352536] Workqueue: kcryptd kcryptd_crypt [dm_crypt]
> [  345.352536]  ffff88003df7f438 ffff88003e5d0380 ffff88003e5d0380 ffff88003e5d8e80
> [  345.352536]  ffff88003dfb3240 ffff88003df73240 ffff88003df80000 ffff88003df7f470
> [  345.352536]  ffff88003e5d0380 ffff88003e5d0380 ffff88003df7f828 ffff88003df7f450
> [  345.352536] Call Trace:
> [  345.352536]  [<ffffffff818d466c>] schedule+0x3c/0x90
> [  345.352536]  [<ffffffff818d96a8>] schedule_timeout+0x1d8/0x360
> [  345.352536]  [<ffffffff81135e40>] ? detach_if_pending+0x1c0/0x1c0
> [  345.352536]  [<ffffffff811407c3>] ? ktime_get+0xb3/0x150
> [  345.352536]  [<ffffffff811958cf>] ? __delayacct_blkio_start+0x1f/0x30
> [  345.352536]  [<ffffffff818d39e4>] io_schedule_timeout+0xa4/0x110
> [  345.352536]  [<ffffffff8121d886>] congestion_wait+0x86/0x1f0
> [  345.352536]  [<ffffffff810fdf40>] ? prepare_to_wait_event+0xf0/0xf0
> [  345.352536]  [<ffffffff812061d4>] throttle_vm_writeout+0x44/0xd0
> [  345.352536]  [<ffffffff81211533>] shrink_zone_memcg+0x613/0x720
> [  345.352536]  [<ffffffff81211720>] shrink_zone+0xe0/0x300
> [  345.352536]  [<ffffffff81211aed>] do_try_to_free_pages+0x1ad/0x450
> [  345.352536]  [<ffffffff81211e7f>] try_to_free_pages+0xef/0x300
> [  345.352536]  [<ffffffff811fef19>] __alloc_pages_nodemask+0x879/0x1210
> [  345.352536]  [<ffffffff810e8080>] ? sched_clock_cpu+0x90/0xc0
> [  345.352536]  [<ffffffff8125a8d1>] alloc_pages_current+0xa1/0x1f0
> [  345.352536]  [<ffffffff81265ef5>] ? new_slab+0x3f5/0x6a0
> [  345.352536]  [<ffffffff81265dd7>] new_slab+0x2d7/0x6a0
> [  345.352536]  [<ffffffff810e7f87>] ? sched_clock_local+0x17/0x80
> [  345.352536]  [<ffffffff812678cb>] ___slab_alloc+0x3fb/0x5c0
> [  345.352536]  [<ffffffff811f71bd>] ? mempool_alloc_slab+0x1d/0x30
> [  345.352536]  [<ffffffff810e7f87>] ? sched_clock_local+0x17/0x80
> [  345.352536]  [<ffffffff811f71bd>] ? mempool_alloc_slab+0x1d/0x30
> [  345.352536]  [<ffffffff81267ae1>] __slab_alloc+0x51/0x90
> [  345.352536]  [<ffffffff811f71bd>] ? mempool_alloc_slab+0x1d/0x30
> [  345.352536]  [<ffffffff81267d9b>] kmem_cache_alloc+0x27b/0x310
> [  345.352536]  [<ffffffff811f71bd>] mempool_alloc_slab+0x1d/0x30
> [  345.352536]  [<ffffffff811f6f11>] mempool_alloc+0x91/0x230
> [  345.352536]  [<ffffffff8141a02d>] bio_alloc_bioset+0xbd/0x260
> [  345.352536]  [<ffffffffc02f1a54>] kcryptd_crypt+0x114/0x3b0 [dm_crypt]
> 
> Memory pools are usually used for the writeback paths and it doesn't
> really make much sense to throttle them just because there are too many
> dirty/writeback pages. The main purpose of throttle_vm_writeout is to
> make sure that the pageout path doesn't generate too much dirty data.
> Considering that we are in mempool path which performs __GFP_NORETRY
> requests the risk shouldn't be really high.
> 
> Fix this by ensuring that mempool users will get PF_LESS_THROTTLE and
> that such processes are not throttled in throttle_vm_writeout. They can
> still get throttled due to current_may_throttle() sleeps but that should
> happen when the backing device itself is congested which sounds like a
> proper reaction.
> 
> Please note that the bonus given by domain_dirty_limits() alone is not
> sufficient because at least dm-crypt has to double buffer each page
> under writeback so this won't be sufficient to prevent from being
> throttled.
> 
> There are other users of the flag but they are in the writeout path so
> this looks like a proper thing for them as well.
> 
> Reported-by: Mikulas Patocka <mpatocka@redhat.com>
> Signed-off-by: Michal Hocko <mhocko@suse.com>

Reviewed-by: Mikulas Patocka <mpatocka@redhat.com>
Tested-by: Mikulas Patocka <mpatocka@redhat.com>

> ---
>  mm/mempool.c        | 19 +++++++++++++++----
>  mm/page-writeback.c |  3 +++
>  2 files changed, 18 insertions(+), 4 deletions(-)
> 
> diff --git a/mm/mempool.c b/mm/mempool.c
> index ea26d75c8adf..916e95c4192c 100644
> --- a/mm/mempool.c
> +++ b/mm/mempool.c
> @@ -310,7 +310,8 @@ EXPORT_SYMBOL(mempool_resize);
>   */
>  void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
>  {
> -	void *element;
> +	unsigned int pflags = current->flags;
> +	void *element = NULL;
>  	unsigned long flags;
>  	wait_queue_t wait;
>  	gfp_t gfp_temp;
> @@ -328,6 +329,12 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
>  
>  	gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO);
>  
> +	/*
> +	 * Make sure that the allocation doesn't get throttled during the
> +	 * reclaim
> +	 */
> +	if (gfpflags_allow_blocking(gfp_mask))
> +		current->flags |= PF_LESS_THROTTLE;
>  repeat_alloc:
>  	/*
>  	 * Make sure that the OOM victim will get access to memory reserves
> @@ -339,7 +346,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
>  
>  	element = pool->alloc(gfp_temp, pool->pool_data);
>  	if (likely(element != NULL))
> -		return element;
> +		goto out;
>  
>  	spin_lock_irqsave(&pool->lock, flags);
>  	if (likely(pool->curr_nr)) {
> @@ -352,7 +359,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
>  		 * for debugging.
>  		 */
>  		kmemleak_update_trace(element);
> -		return element;
> +		goto out;
>  	}
>  
>  	/*
> @@ -369,7 +376,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
>  	/* We must not sleep if !__GFP_DIRECT_RECLAIM */
>  	if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
>  		spin_unlock_irqrestore(&pool->lock, flags);
> -		return NULL;
> +		goto out;
>  	}
>  
>  	/* Let's wait for someone else to return an element to @pool */
> @@ -386,6 +393,10 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
>  
>  	finish_wait(&pool->wait, &wait);
>  	goto repeat_alloc;
> +out:
> +	if (gfpflags_allow_blocking(gfp_mask))
> +		tsk_restore_flags(current, pflags, PF_LESS_THROTTLE);
> +	return element;
>  }
>  EXPORT_SYMBOL(mempool_alloc);
>  
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 7fbb2d008078..a37661f1a11b 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -1971,6 +1971,9 @@ void throttle_vm_writeout(gfp_t gfp_mask)
>  	unsigned long background_thresh;
>  	unsigned long dirty_thresh;
>  
> +	if (current->flags & PF_LESS_THROTTLE)
> +		return;
> +
>          for ( ; ; ) {
>  		global_dirty_limits(&background_thresh, &dirty_thresh);
>  		dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh);
> -- 
> 2.8.1
> 

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
NeilBrown July 22, 2016, 8:46 a.m. UTC | #2
On Mon, Jul 18 2016, Michal Hocko wrote:

> From: Michal Hocko <mhocko@suse.com>
>
> Mikulas has reported that a swap backed by dm-crypt doesn't work
> properly because the swapout cannot make a sufficient forward progress
> as the writeout path depends on dm_crypt worker which has to allocate
> memory to perform the encryption. In order to guarantee a forward
> progress it relies on the mempool allocator. mempool_alloc(), however,
> prefers to use the underlying (usually page) allocator before it grabs
> objects from the pool. Such an allocation can dive into the memory
> reclaim and consequently to throttle_vm_writeout.

That's just broken.
I used to think mempool should always use the pre-allocated reserves
first.  That is surely the most logical course of action.  Otherwise
that memory is just sitting there doing nothing useful.

I spoke to Nick Piggin about this some years ago and he pointed out that
the kmalloc allocation paths are much better optimized for low overhead
when there is plenty of memory.  They can just pluck a free block of a
per-CPU list without taking any locks.   By contrast, accessing the
preallocated pool always requires a spinlock.

So it makes lots of sense to prefer the underlying allocator if it can
provide a quick response.  If it cannot, the sensible thing is to use
the pool, or wait for the pool to be replenished.

So the allocator should never wait at all, never enter reclaim, never
throttle.

Looking at the current code, __GFP_DIRECT_RECLAIM is disabled the first
time through, but if the pool is empty, direct-reclaim is allowed on the
next attempt.  Presumably this is where the throttling comes in ??  I
suspect that it really shouldn't do that. It should leave kswapd to do
reclaim (so __GFP_KSWAPD_RECLAIM is appropriate) and only wait in
mempool_alloc where pool->wait can wake it up.

If I'm following the code properly, the stack trace below can only
happen if the first pool->alloc() attempt, with direct-reclaim disabled,
fails and the pool is empty, so mempool_alloc() calls prepare_to_wait()
and io_schedule_timeout().
I suspect the timeout *doesn't* fire (5 seconds is a long time) so it
gets woken up when there is something in the pool.  It then loops around
and tries pool->alloc() again, even though there is something in the
pool.  This might be justified if that ->alloc would never block, but
obviously it does.

I would very strongly recommend just changing mempool_alloc() to
permanently mask out __GFP_DIRECT_RECLAIM.

Quite separately I don't think PF_LESS_THROTTLE is at all appropriate.
It is "LESS" throttle, not "NO" throttle, but you have made
throttle_vm_writeout never throttle PF_LESS_THROTTLE threads.
The purpose of that flag is to allow a thread to dirty a page-cache page
as part of cleaning another page-cache page.
So it makes sense for loop and sometimes for nfsd.  It would make sense
for dm-crypt if it was putting the encrypted version in the page cache.
But if dm-crypt is just allocating a transient page (which I think it
is), then a mempool should be sufficient (and we should make sure it is
sufficient) and access to an extra 10% (or whatever) of the page cache
isn't justified.

Thanks,
NeilBrown



 If there are too many
> dirty or pages under writeback it will get throttled even though it is
> in fact a flusher to clear pending pages.
>
> [  345.352536] kworker/u4:0    D ffff88003df7f438 10488     6      2 0x00000000
> [  345.352536] Workqueue: kcryptd kcryptd_crypt [dm_crypt]
> [  345.352536]  ffff88003df7f438 ffff88003e5d0380 ffff88003e5d0380 ffff88003e5d8e80
> [  345.352536]  ffff88003dfb3240 ffff88003df73240 ffff88003df80000 ffff88003df7f470
> [  345.352536]  ffff88003e5d0380 ffff88003e5d0380 ffff88003df7f828 ffff88003df7f450
> [  345.352536] Call Trace:
> [  345.352536]  [<ffffffff818d466c>] schedule+0x3c/0x90
> [  345.352536]  [<ffffffff818d96a8>] schedule_timeout+0x1d8/0x360
> [  345.352536]  [<ffffffff81135e40>] ? detach_if_pending+0x1c0/0x1c0
> [  345.352536]  [<ffffffff811407c3>] ? ktime_get+0xb3/0x150
> [  345.352536]  [<ffffffff811958cf>] ? __delayacct_blkio_start+0x1f/0x30
> [  345.352536]  [<ffffffff818d39e4>] io_schedule_timeout+0xa4/0x110
> [  345.352536]  [<ffffffff8121d886>] congestion_wait+0x86/0x1f0
> [  345.352536]  [<ffffffff810fdf40>] ? prepare_to_wait_event+0xf0/0xf0
> [  345.352536]  [<ffffffff812061d4>] throttle_vm_writeout+0x44/0xd0
> [  345.352536]  [<ffffffff81211533>] shrink_zone_memcg+0x613/0x720
> [  345.352536]  [<ffffffff81211720>] shrink_zone+0xe0/0x300
> [  345.352536]  [<ffffffff81211aed>] do_try_to_free_pages+0x1ad/0x450
> [  345.352536]  [<ffffffff81211e7f>] try_to_free_pages+0xef/0x300
> [  345.352536]  [<ffffffff811fef19>] __alloc_pages_nodemask+0x879/0x1210
> [  345.352536]  [<ffffffff810e8080>] ? sched_clock_cpu+0x90/0xc0
> [  345.352536]  [<ffffffff8125a8d1>] alloc_pages_current+0xa1/0x1f0
> [  345.352536]  [<ffffffff81265ef5>] ? new_slab+0x3f5/0x6a0
> [  345.352536]  [<ffffffff81265dd7>] new_slab+0x2d7/0x6a0
> [  345.352536]  [<ffffffff810e7f87>] ? sched_clock_local+0x17/0x80
> [  345.352536]  [<ffffffff812678cb>] ___slab_alloc+0x3fb/0x5c0
> [  345.352536]  [<ffffffff811f71bd>] ? mempool_alloc_slab+0x1d/0x30
> [  345.352536]  [<ffffffff810e7f87>] ? sched_clock_local+0x17/0x80
> [  345.352536]  [<ffffffff811f71bd>] ? mempool_alloc_slab+0x1d/0x30
> [  345.352536]  [<ffffffff81267ae1>] __slab_alloc+0x51/0x90
> [  345.352536]  [<ffffffff811f71bd>] ? mempool_alloc_slab+0x1d/0x30
> [  345.352536]  [<ffffffff81267d9b>] kmem_cache_alloc+0x27b/0x310
> [  345.352536]  [<ffffffff811f71bd>] mempool_alloc_slab+0x1d/0x30
> [  345.352536]  [<ffffffff811f6f11>] mempool_alloc+0x91/0x230
> [  345.352536]  [<ffffffff8141a02d>] bio_alloc_bioset+0xbd/0x260
> [  345.352536]  [<ffffffffc02f1a54>] kcryptd_crypt+0x114/0x3b0 [dm_crypt]
--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
NeilBrown July 22, 2016, 9:04 a.m. UTC | #3
On Fri, Jul 22 2016, NeilBrown wrote:

>
> Looking at the current code, __GFP_DIRECT_RECLAIM is disabled the first
> time through, but if the pool is empty, direct-reclaim is allowed on the
> next attempt.  Presumably this is where the throttling comes in ??  I
> suspect that it really shouldn't do that. It should leave kswapd to do
> reclaim (so __GFP_KSWAPD_RECLAIM is appropriate) and only wait in
> mempool_alloc where pool->wait can wake it up.

Actually, thinking about the kswapd connection, it might make sense
for mempool_alloc() to wait in the relevant pgdata->pfmemalloc_wait as
well as waiting on pool->wait.  That way it should be able to proceed as
soon as any memory is available.  I don't know what the correct 'pgdata'
is though.

Just a thought,
NeilBrown
--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
Michal Hocko July 22, 2016, 9:15 a.m. UTC | #4
On Fri 22-07-16 18:46:57, Neil Brown wrote:
> On Mon, Jul 18 2016, Michal Hocko wrote:
> 
> > From: Michal Hocko <mhocko@suse.com>
> >
> > Mikulas has reported that a swap backed by dm-crypt doesn't work
> > properly because the swapout cannot make a sufficient forward progress
> > as the writeout path depends on dm_crypt worker which has to allocate
> > memory to perform the encryption. In order to guarantee a forward
> > progress it relies on the mempool allocator. mempool_alloc(), however,
> > prefers to use the underlying (usually page) allocator before it grabs
> > objects from the pool. Such an allocation can dive into the memory
> > reclaim and consequently to throttle_vm_writeout.
> 
> That's just broken.
> I used to think mempool should always use the pre-allocated reserves
> first.  That is surely the most logical course of action.  Otherwise
> that memory is just sitting there doing nothing useful.
> 
> I spoke to Nick Piggin about this some years ago and he pointed out that
> the kmalloc allocation paths are much better optimized for low overhead
> when there is plenty of memory.  They can just pluck a free block of a
> per-CPU list without taking any locks.   By contrast, accessing the
> preallocated pool always requires a spinlock.
> 
> So it makes lots of sense to prefer the underlying allocator if it can
> provide a quick response.  If it cannot, the sensible thing is to use
> the pool, or wait for the pool to be replenished.
> 
> So the allocator should never wait at all, never enter reclaim, never
> throttle.
> 
> Looking at the current code, __GFP_DIRECT_RECLAIM is disabled the first
> time through, but if the pool is empty, direct-reclaim is allowed on the
> next attempt.  Presumably this is where the throttling comes in ??

Yes that is correct.

> I suspect that it really shouldn't do that. It should leave kswapd to
> do reclaim (so __GFP_KSWAPD_RECLAIM is appropriate) and only wait in
> mempool_alloc where pool->wait can wake it up.

Mikulas was already suggesting that and my concern was that this would
give up prematurely even under mild page cache load when there are many
clean page cache pages. If we just back off and rely on kswapd which
might get stuck on the writeout then the IO throughput can be reduced
I believe which would make the whole memory pressure just worse. So I am
not sure this is a good idea in general. I completely agree with you
that the mempool request shouldn't be throttled unless there is a strong
reason for that. More on that below.

> If I'm following the code properly, the stack trace below can only
> happen if the first pool->alloc() attempt, with direct-reclaim disabled,
> fails and the pool is empty, so mempool_alloc() calls prepare_to_wait()
> and io_schedule_timeout().

mempool_alloc retries immediately without any sleep after the first
no-reclaim attempt.

> I suspect the timeout *doesn't* fire (5 seconds is along time) so it
> gets woken up when there is something in the pool.  It then loops around
> and tries pool->alloc() again, even though there is something in the
> pool.  This might be justified if that ->alloc would never block, but
> obviously it does.
> 
> I would very strongly recommend just changing mempool_alloc() to
> permanently mask out __GFP_DIRECT_RECLAIM.
> 
> Quite separately I don't think PF_LESS_THROTTLE is at all appropriate.
> It is "LESS" throttle, not "NO" throttle, but you have made
> throttle_vm_writeout never throttle PF_LESS_THROTTLE threads.

Yes that is correct. But it still allows to throttle on congestion:
shrink_inactive_list:
	/*
	 * Stall direct reclaim for IO completions if underlying BDIs or zone
	 * is congested. Allow kswapd to continue until it starts encountering
	 * unqueued dirty pages or cycling through the LRU too quickly.
	 */
	if (!sc->hibernation_mode && !current_is_kswapd() &&
	    current_may_throttle())
		wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10);

My thinking was that throttle_vm_writeout is there to prevent
dirtying too many pages from the reclaim context.  PF_LESS_THROTTLE
is part of the writeout so throttling it on too many dirty pages is
questionable (well we get some bias but that is not really reliable). It
still makes sense to throttle when the backing device is congested
because the writeout path wouldn't make much progress anyway and we also
do not want to cycle through LRU lists too quickly in that case.

Or is this assumption wrong for nfsd_vfs_write? Can it cause unbounded
dirtying of memory?

> The purpose of that flag is to allow a thread to dirty a page-cache page
> as part of cleaning another page-cache page.
> So it makes sense for loop and sometimes for nfsd.  It would make sense
> for dm-crypt if it was putting the encrypted version in the page cache.
> But if dm-crypt is just allocating a transient page (which I think it
> is), then a mempool should be sufficient (and we should make sure it is
> sufficient) and access to an extra 10% (or whatever) of the page cache
> isn't justified.

If you think that PF_LESS_THROTTLE (ab)use in mempool_alloc is not
appropriate then would a PF_MEMPOOL be any better?

Thanks!
diff mbox

Patch

diff --git a/mm/mempool.c b/mm/mempool.c
index ea26d75c8adf..916e95c4192c 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -310,7 +310,8 @@  EXPORT_SYMBOL(mempool_resize);
  */
 void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
 {
-	void *element;
+	unsigned int pflags = current->flags;
+	void *element = NULL;
 	unsigned long flags;
 	wait_queue_t wait;
 	gfp_t gfp_temp;
@@ -328,6 +329,12 @@  void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
 
 	gfp_temp = gfp_mask & ~(__GFP_DIRECT_RECLAIM|__GFP_IO);
 
+	/*
+	 * Make sure that the allocation doesn't get throttled during the
+	 * reclaim
+	 */
+	if (gfpflags_allow_blocking(gfp_mask))
+		current->flags |= PF_LESS_THROTTLE;
 repeat_alloc:
 	/*
 	 * Make sure that the OOM victim will get access to memory reserves
@@ -339,7 +346,7 @@  void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
 
 	element = pool->alloc(gfp_temp, pool->pool_data);
 	if (likely(element != NULL))
-		return element;
+		goto out;
 
 	spin_lock_irqsave(&pool->lock, flags);
 	if (likely(pool->curr_nr)) {
@@ -352,7 +359,7 @@  void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
 		 * for debugging.
 		 */
 		kmemleak_update_trace(element);
-		return element;
+		goto out;
 	}
 
 	/*
@@ -369,7 +376,7 @@  void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
 	/* We must not sleep if !__GFP_DIRECT_RECLAIM */
 	if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
 		spin_unlock_irqrestore(&pool->lock, flags);
-		return NULL;
+		goto out;
 	}
 
 	/* Let's wait for someone else to return an element to @pool */
@@ -386,6 +393,10 @@  void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
 
 	finish_wait(&pool->wait, &wait);
 	goto repeat_alloc;
+out:
+	if (gfpflags_allow_blocking(gfp_mask))
+		tsk_restore_flags(current, pflags, PF_LESS_THROTTLE);
+	return element;
 }
 EXPORT_SYMBOL(mempool_alloc);
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7fbb2d008078..a37661f1a11b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1971,6 +1971,9 @@  void throttle_vm_writeout(gfp_t gfp_mask)
 	unsigned long background_thresh;
 	unsigned long dirty_thresh;
 
+	if (current->flags & PF_LESS_THROTTLE)
+		return;
+
         for ( ; ; ) {
 		global_dirty_limits(&background_thresh, &dirty_thresh);
 		dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh);