
[RFC] vmscan.c: add a sysctl entry for controlling memory reclaim IO congestion_wait length

Message ID 20190917115824.16990-1-linf@wangsu.com (mailing list archive)
State New, archived

Commit Message

Lin Feng Sept. 17, 2019, 11:58 a.m. UTC
This sysctl is named mm_reclaim_congestion_wait_jiffies and defaults to
HZ/10, leaving the current hard-coded behaviour unchanged.

It is expressed in jiffies and can be set in the range [1, 100], so
check CONFIG_HZ before tuning.

Both the direct and background (kswapd) page reclaim paths may end up
calling msleep(100), congestion_wait(HZ/10) or wait_iff_congested(HZ/10)
while under IO pressure. The sleep length is hard-coded, and the latter
two introduce 100ms of iowait each time.

So if page reclaim is relatively active in some circumstances, such as
frequent high-order page reaping, it's possible to see a lot of iowait
introduced by congestion_wait(HZ/10) and wait_iff_congested(HZ/10).

The 100ms sleep length is appropriate if the backing devices are slow,
like traditional rotational disks. But if the backing devices are high-end
storage such as high-IOPS SSDs or even faster devices, the high iowait
introduced by page reclaim is really misleading, because the storage IO
utilization seen by iostat is quite low; in this case a congestion_wait
time reduced to 1ms is likely enough for high-end SSDs.

Another benefit is that it potentially shortens the time direct reclaim is
blocked when the kernel falls into the synchronous reclaim path, which may
improve user applications' response time.

All-SSD boxes are a trend, so introduce this sysctl entry as a way to
relieve the concerns of system administrators.

Tested:
1. Before this patch:

top - 10:10:40 up 8 days, 16:22,  4 users,  load average: 2.21, 2.15, 2.10
Tasks: 718 total,   5 running, 712 sleeping,   0 stopped,   1 zombie
Cpu0  :  0.3%us,  3.4%sy,  0.0%ni, 95.3%id,  1.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu1  :  1.4%us,  1.7%sy,  0.0%ni, 95.2%id,  0.0%wa,  0.0%hi,  1.7%si,  0.0%st
Cpu2  :  4.7%us,  3.3%sy,  0.0%ni, 91.0%id,  0.0%wa,  0.0%hi,  1.0%si,  0.0%st
Cpu3  :  7.0%us,  3.7%sy,  0.0%ni, 87.7%id,  1.0%wa,  0.0%hi,  0.7%si,  0.0%st
Cpu4  :  1.0%us,  2.0%sy,  0.0%ni, 96.3%id,  0.0%wa,  0.0%hi,  0.7%si,  0.0%st
Cpu5  :  1.0%us,  2.0%sy,  0.0%ni,  1.7%id, 95.0%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu6  :  1.0%us,  1.3%sy,  0.0%ni, 97.3%id,  0.0%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu7  :  1.3%us,  1.0%sy,  0.0%ni, 97.7%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu8  :  4.3%us,  1.3%sy,  0.0%ni, 94.3%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu9  :  0.7%us,  0.7%sy,  0.0%ni, 98.3%id,  0.0%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu10 :  0.7%us,  1.0%sy,  0.0%ni, 98.3%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu11 :  1.0%us,  1.0%sy,  0.0%ni, 97.7%id,  0.0%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu12 :  3.0%us,  1.0%sy,  0.0%ni, 95.3%id,  0.3%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu13 :  0.3%us,  1.3%sy,  0.0%ni, 88.6%id,  9.4%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu14 :  3.3%us,  2.3%sy,  0.0%ni, 93.7%id,  0.3%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu15 :  6.4%us,  3.0%sy,  0.0%ni, 90.2%id,  0.0%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu16 :  2.7%us,  1.7%sy,  0.0%ni, 95.3%id,  0.0%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu17 :  1.0%us,  1.7%sy,  0.0%ni, 97.3%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu18 :  1.3%us,  1.0%sy,  0.0%ni, 97.0%id,  0.3%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu19 :  4.3%us,  1.7%sy,  0.0%ni, 86.0%id,  7.7%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu20 :  0.7%us,  1.3%sy,  0.0%ni, 97.7%id,  0.0%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu21 :  0.3%us,  1.7%sy,  0.0%ni, 50.2%id, 47.5%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu22 :  0.7%us,  0.7%sy,  0.0%ni, 98.7%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu23 :  0.7%us,  0.7%sy,  0.0%ni, 98.3%id,  0.0%wa,  0.0%hi,  0.3%si,  0.0%st

2. After this patch and set mm_reclaim_congestion_wait_jiffies to 1:

top - 10:12:19 up 8 days, 16:24,  4 users,  load average: 1.32, 1.93, 2.03
Tasks: 724 total,   2 running, 721 sleeping,   0 stopped,   1 zombie
Cpu0  :  4.4%us,  3.0%sy,  0.0%ni, 90.3%id,  1.3%wa,  0.0%hi,  1.0%si,  0.0%st
Cpu1  :  2.1%us,  1.4%sy,  0.0%ni, 93.5%id,  0.7%wa,  0.0%hi,  2.4%si,  0.0%st
Cpu2  :  2.7%us,  1.0%sy,  0.0%ni, 96.3%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu3  :  1.0%us,  1.0%sy,  0.0%ni, 97.7%id,  0.0%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu4  :  0.7%us,  1.0%sy,  0.0%ni, 97.7%id,  0.3%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu5  :  1.0%us,  0.7%sy,  0.0%ni, 97.7%id,  0.3%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu6  :  1.7%us,  1.0%sy,  0.0%ni, 97.3%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu7  :  2.0%us,  0.7%sy,  0.0%ni, 94.3%id,  2.7%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu8  :  2.0%us,  0.7%sy,  0.0%ni, 97.0%id,  0.0%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu9  :  0.7%us,  1.0%sy,  0.0%ni, 97.7%id,  0.7%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu10 :  0.3%us,  0.3%sy,  0.0%ni, 99.3%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu11 :  0.7%us,  0.3%sy,  0.0%ni, 99.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu12 :  0.7%us,  1.0%sy,  0.0%ni, 98.0%id,  0.0%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu13 :  0.0%us,  0.3%sy,  0.0%ni, 99.3%id,  0.0%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu14 :  1.7%us,  0.7%sy,  0.0%ni, 97.3%id,  0.3%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu15 :  4.3%us,  1.0%sy,  0.0%ni, 94.3%id,  0.0%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu16 :  1.7%us,  1.3%sy,  0.0%ni, 96.3%id,  0.0%wa,  0.0%hi,  0.7%si,  0.0%st
Cpu17 :  2.0%us,  1.3%sy,  0.0%ni, 96.3%id,  0.0%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu18 :  0.3%us,  0.3%sy,  0.0%ni, 99.3%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu19 :  1.0%us,  1.0%sy,  0.0%ni, 97.6%id,  0.0%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu20 :  1.3%us,  0.7%sy,  0.0%ni, 97.0%id,  0.7%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu21 :  0.7%us,  0.7%sy,  0.0%ni, 98.3%id,  0.0%wa,  0.0%hi,  0.3%si,  0.0%st
Cpu22 :  1.0%us,  1.0%sy,  0.0%ni, 98.0%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Cpu23 :  0.7%us,  0.3%sy,  0.0%ni, 98.3%id,  0.0%wa,  0.0%hi,  0.7%si,  0.0%st

Signed-off-by: Lin Feng <linf@wangsu.com>
---
 Documentation/admin-guide/sysctl/vm.rst | 17 +++++++++++++++++
 kernel/sysctl.c                         | 10 ++++++++++
 mm/vmscan.c                             | 12 +++++++++---
 3 files changed, 36 insertions(+), 3 deletions(-)
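
For illustration, a minimal userspace sketch of setting the proposed knob to
1 jiffy follows. The procfs path is assumed from the vm_table entry in the
patch at the bottom of this page, and the resulting sleep length depends on
CONFIG_HZ as noted in the changelog.

/* sketch: write "1" to the proposed sysctl; path assumed from the patch */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *path = "/proc/sys/vm/mm_reclaim_congestion_wait_jiffies";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return EXIT_FAILURE;
	}
	/* 1 jiffy is 10ms with HZ=100, 4ms with HZ=250, 1ms with HZ=1000 */
	if (fprintf(f, "1\n") < 0 || fclose(f) != 0) {
		perror(path);
		return EXIT_FAILURE;
	}
	return EXIT_SUCCESS;
}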

Comments

Matthew Wilcox Sept. 17, 2019, 12:06 p.m. UTC | #1
On Tue, Sep 17, 2019 at 07:58:24PM +0800, Lin Feng wrote:
> In direct and background(kswapd) pages reclaim paths both may fall into
> calling msleep(100) or congestion_wait(HZ/10) or wait_iff_congested(HZ/10)
> while under IO pressure, and the sleep length is hard-coded and the later
> two will introduce 100ms iowait length per time.
> 
> So if pages reclaim is relatively active in some circumstances such as high
> order pages reappings, it's possible to see a lot of iowait introduced by
> congestion_wait(HZ/10) and wait_iff_congested(HZ/10).
> 
> The 100ms sleep length is proper if the backing drivers are slow like
> traditionnal rotation disks. While if the backing drivers are high-end
> storages such as high iops ssds or even faster drivers, the high iowait
> inroduced by pages reclaim is really misleading, because the storage IO
> utils seen by iostat is quite low, in this case the congestion_wait time
> modified to 1ms is likely enough for high-end ssds.
> 
> Another benifit is that it's potentially shorter the direct reclaim blocked
> time when kernel falls into sync reclaim path, which may improve user
> applications response time.

This is a great description of the problem.

> +mm_reclaim_congestion_wait_jiffies
> +==========
> +
> +This control is used to define how long kernel will wait/sleep while
> +system memory is under pressure and memroy reclaim is relatively active.
> +Lower values will decrease the kernel wait/sleep time.
> +
> +It's suggested to lower this value on high-end box that system is under memory
> +pressure but with low storage IO utils and high CPU iowait, which could also
> +potentially decrease user application response time in this case.
> +
> +Keep this control as it were if your box are not above case.
> +
> +The default value is HZ/10, which is of equal value to 100ms independ of how
> +many HZ is defined.

Adding a new tunable is not the right solution.  The right way is
to make Linux auto-tune itself to avoid the problem.  For example,
bdi_writeback contains an estimated write bandwidth (calculated by the
memory management layer).  Given that, we should be able to make an
estimate for how long to wait for the queues to drain.
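
To get a rough feel for the calculation being suggested here, below is a
minimal standalone sketch. The bandwidth number stands in for the estimated
write bandwidth that bdi_writeback tracks; the pages-per-second unit, the
"drain about half of what is queued" heuristic and the 1..100ms clamp are
illustrative assumptions, not the kernel's actual API or policy.

/*
 * Sketch: derive a reclaim sleep length from an estimated write bandwidth
 * and the number of pages queued for writeback, clamped between 1ms and
 * the current 100ms ceiling.
 */
#include <stdio.h>

static unsigned int reclaim_wait_ms(unsigned long bw_pages_per_sec,
				    unsigned long queued_pages)
{
	unsigned long ms;

	if (!bw_pages_per_sec)
		return 100;	/* no estimate yet: keep the old behaviour */

	/* time to drain roughly half of what is currently queued */
	ms = queued_pages * 1000UL / bw_pages_per_sec / 2;

	if (ms < 1)
		ms = 1;		/* fast SSD/NVMe: sleep about 1ms */
	if (ms > 100)
		ms = 100;	/* slow rotating disk: cap at 100ms */
	return (unsigned int)ms;
}

int main(void)
{
	/* ~400MB/s SSD (102400 4K pages/s), 2048 pages queued -> 10ms */
	printf("%u ms\n", reclaim_wait_ms(102400, 2048));
	/* ~20MB/s USB stick (5120 pages/s), 2048 pages queued -> capped at 100ms */
	printf("%u ms\n", reclaim_wait_ms(5120, 2048));
	return 0;
}
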
Lin Feng Sept. 18, 2019, 3:21 a.m. UTC | #2
On 9/17/19 20:06, Matthew Wilcox wrote:
> On Tue, Sep 17, 2019 at 07:58:24PM +0800, Lin Feng wrote:
>> In direct and background(kswapd) pages reclaim paths both may fall into
>> calling msleep(100) or congestion_wait(HZ/10) or wait_iff_congested(HZ/10)
>> while under IO pressure, and the sleep length is hard-coded and the later
>> two will introduce 100ms iowait length per time.
>>
>> So if pages reclaim is relatively active in some circumstances such as high
>> order pages reappings, it's possible to see a lot of iowait introduced by
>> congestion_wait(HZ/10) and wait_iff_congested(HZ/10).
>>
>> The 100ms sleep length is proper if the backing drivers are slow like
>> traditionnal rotation disks. While if the backing drivers are high-end
>> storages such as high iops ssds or even faster drivers, the high iowait
>> inroduced by pages reclaim is really misleading, because the storage IO
>> utils seen by iostat is quite low, in this case the congestion_wait time
>> modified to 1ms is likely enough for high-end ssds.
>>
>> Another benifit is that it's potentially shorter the direct reclaim blocked
>> time when kernel falls into sync reclaim path, which may improve user
>> applications response time.
> 
> This is a great description of the problem.
The unconditional 100ms blocking time is sometimes unnecessary :)

> 
>> +mm_reclaim_congestion_wait_jiffies
>> +==========
>> +
>> +This control is used to define how long kernel will wait/sleep while
>> +system memory is under pressure and memroy reclaim is relatively active.
>> +Lower values will decrease the kernel wait/sleep time.
>> +
>> +It's suggested to lower this value on high-end box that system is under memory
>> +pressure but with low storage IO utils and high CPU iowait, which could also
>> +potentially decrease user application response time in this case.
>> +
>> +Keep this control as it were if your box are not above case.
>> +
>> +The default value is HZ/10, which is of equal value to 100ms independ of how
>> +many HZ is defined.
> 
> Adding a new tunable is not the right solution.  The right way is
> to make Linux auto-tune itself to avoid the problem.  For example,
> bdi_writeback contains an estimated write bandwidth (calculated by the
> memory management layer).  Given that, we should be able to make an
> estimate for how long to wait for the queues to drain.
> 

Yes, I had considered that; auto-tuning is definitely the smarter way.
But considering all kinds of production environments, hybrid storage
solutions are also common today: a server's dirty pages can sit on bdis
spanning from high-end SSDs to low-end SATA disks. So we would have to come
up with a *formula* built from the amount of dirty pages and the bdis'
write bandwidth, and that formula would depend on whether the estimated
write bandwidth is sane and, moreover, for a rotational bdi, on whether the
dirty pages to be written back are sequential or random. It is likely to
produce an unreasonable number and hurt people who don't want that, whereas
considering only SSDs is relatively simple.

So IMHO it's not sane to brute-force guessing logic into the memory
writeback code and pray for a formula that caters to everyone's needs.
Adding a sysctl entry may be the right choice: it gives a knob to people
who need it and doesn't hurt people who don't.

thanks,
linfeng
Matthew Wilcox Sept. 18, 2019, 11:38 a.m. UTC | #3
On Wed, Sep 18, 2019 at 11:21:04AM +0800, Lin Feng wrote:
> > Adding a new tunable is not the right solution.  The right way is
> > to make Linux auto-tune itself to avoid the problem.  For example,
> > bdi_writeback contains an estimated write bandwidth (calculated by the
> > memory management layer).  Given that, we should be able to make an
> > estimate for how long to wait for the queues to drain.
> > 
> 
> Yes, I had ever considered that, auto-tuning is definitely the senior AI way.
> While considering all kinds of production environments hybird storage solution
> is also common today, servers' dirty pages' bdi drivers can span from high end
> ssds to low end sata disk, so we have to think of a *formula(AI core)* by using
> the factors of dirty pages' amount and bdis' write bandwidth, and this AI-core
> will depend on if the estimated write bandwidth is sane and moreover the to be
> written back dirty pages is sequential or random if the bdi is rotational disk,
> it's likey to give a not-sane number and hurt guys who dont't want that, while
> if only consider ssd is relatively simple.
> 
> So IMHO it's not sane to brute force add a guessing logic into memory writeback
> codes and pray on inventing a formula that caters everyone's need.
> Add a sysctl entry may be a right choice that give people who need it and
> doesn't hurt people who don't want it.

You're making this sound far harder than it is.  All the writeback code
needs to know is "How long should I sleep for in order for the queues
to drain a substantial amount".  Since you know the bandwidth and how
many pages you've queued up, it's a simple calculation.
Michal Hocko Sept. 18, 2019, 12:33 p.m. UTC | #4
On Tue 17-09-19 05:06:46, Matthew Wilcox wrote:
> On Tue, Sep 17, 2019 at 07:58:24PM +0800, Lin Feng wrote:
[...]
> > +mm_reclaim_congestion_wait_jiffies
> > +==========
> > +
> > +This control is used to define how long kernel will wait/sleep while
> > +system memory is under pressure and memroy reclaim is relatively active.
> > +Lower values will decrease the kernel wait/sleep time.
> > +
> > +It's suggested to lower this value on high-end box that system is under memory
> > +pressure but with low storage IO utils and high CPU iowait, which could also
> > +potentially decrease user application response time in this case.
> > +
> > +Keep this control as it were if your box are not above case.
> > +
> > +The default value is HZ/10, which is of equal value to 100ms independ of how
> > +many HZ is defined.
> 
> Adding a new tunable is not the right solution.  The right way is
> to make Linux auto-tune itself to avoid the problem.

I absolutely agree here. From your changelog it is also not clear what is
the underlying problem. Both congestion_wait and wait_iff_congested
should wake up early if the congestion is handled. Is this not the case?
Why? Are you sure a shorter timeout is not just going to cause problems
elsewhere? These sleeps are used to throttle the reclaim. I do agree
there is no great deal of design behind them, so they are more of a "let's
hope it works" kind of thing, but making their timeout configurable just
doesn't solve this at all. You are effectively exporting a very subtle
implementation detail into userspace.
Lin Feng Sept. 19, 2019, 2:20 a.m. UTC | #5
Hi,

On 9/18/19 19:38, Matthew Wilcox wrote:
> On Wed, Sep 18, 2019 at 11:21:04AM +0800, Lin Feng wrote:
>>> Adding a new tunable is not the right solution.  The right way is
>>> to make Linux auto-tune itself to avoid the problem.  For example,
>>> bdi_writeback contains an estimated write bandwidth (calculated by the
>>> memory management layer).  Given that, we should be able to make an
>>> estimate for how long to wait for the queues to drain.
>>>
>>
>> Yes, I had ever considered that, auto-tuning is definitely the senior AI way.
>> While considering all kinds of production environments hybird storage solution
>> is also common today, servers' dirty pages' bdi drivers can span from high end
>> ssds to low end sata disk, so we have to think of a *formula(AI core)* by using
>> the factors of dirty pages' amount and bdis' write bandwidth, and this AI-core
>> will depend on if the estimated write bandwidth is sane and moreover the to be
>> written back dirty pages is sequential or random if the bdi is rotational disk,
>> it's likey to give a not-sane number and hurt guys who dont't want that, while
>> if only consider ssd is relatively simple.
>>
>> So IMHO it's not sane to brute force add a guessing logic into memory writeback
>> codes and pray on inventing a formula that caters everyone's need.
>> Add a sysctl entry may be a right choice that give people who need it and
>> doesn't hurt people who don't want it.
> 
> You're making this sound far harder than it is.  All the writeback code
> needs to know is "How long should I sleep for in order for the queues
> to drain a substantial amount".  Since you know the bandwidth and how
> many pages you've queued up, it's a simple calculation.
> 

Ah, I should have read more of the writeback code ;-)
Based on Michal's comments:
 > the underlying problem. Both congestion_wait and wait_iff_congested
 > should wake up early if the congestion is handled. Is this not the case?
If the process is woken up once bdi congestion is cleared, the length of this
timeout doesn't seem that important. I need to trace more, if I can reproduce
this issue without live network traffic. But the weird thing is still that,
once I set the people-disliked tunable, iowait drops down instantly; the two
are contradictory.

Anyway, thanks a lot for your suggestions!
linfeng
Lin Feng Sept. 19, 2019, 2:33 a.m. UTC | #6
On 9/18/19 20:33, Michal Hocko wrote:
>>> +mm_reclaim_congestion_wait_jiffies
>>> +==========
>>> +
>>> +This control is used to define how long kernel will wait/sleep while
>>> +system memory is under pressure and memroy reclaim is relatively active.
>>> +Lower values will decrease the kernel wait/sleep time.
>>> +
>>> +It's suggested to lower this value on high-end box that system is under memory
>>> +pressure but with low storage IO utils and high CPU iowait, which could also
>>> +potentially decrease user application response time in this case.
>>> +
>>> +Keep this control as it were if your box are not above case.
>>> +
>>> +The default value is HZ/10, which is of equal value to 100ms independ of how
>>> +many HZ is defined.
>> Adding a new tunable is not the right solution.  The right way is
>> to make Linux auto-tune itself to avoid the problem.
> I absolutely agree here. From you changelog it is also not clear what is
> the underlying problem. Both congestion_wait and wait_iff_congested
> should wake up early if the congestion is handled. Is this not the case?

For now I don't know why; the code looks like it should work as you said.
Maybe I need to trace more of the internals.
But the weird thing is that once I set the people-disliked tunable, iowait
drops down instantly, which contradicts the code design.


> Why? Are you sure a shorter timeout is not just going to cause problems
> elsewhere. These sleeps are used to throttle the reclaim. I do agree
> there is no great deal of design behind them so they are more of "let's
> hope it works" kinda thing but making their timeout configurable just
> doesn't solve this at all. You are effectively exporting a very subtle
> implementation detail into the userspace.

I kind of agree, but it does fix the issue, at least mine, and user response
time also improves in the meantime.
So, just keep the default behaviour as it was and export the knob to whoever
needs it.
Matthew Wilcox Sept. 19, 2019, 3:49 a.m. UTC | #7
On Thu, Sep 19, 2019 at 10:33:10AM +0800, Lin Feng wrote:
> On 9/18/19 20:33, Michal Hocko wrote:
> > I absolutely agree here. From you changelog it is also not clear what is
> > the underlying problem. Both congestion_wait and wait_iff_congested
> > should wake up early if the congestion is handled. Is this not the case?
> 
> For now I don't know why, codes seem should work as you said, maybe I need to
> trace more of the internals.
> But weird thing is that once I set the people-disliked-tunable iowait
> drop down instantly, this is contradictory to the code design.

Yes, this is quite strange.  If setting a smaller timeout makes a
difference, that indicates we're not waking up soon enough.  I see
two possibilities; one is that a wakeup is missing somewhere -- ie the
conditions under which we call clear_wb_congested() are wrong.  Or we
need to wake up sooner.

Umm.  We have clear_wb_congested() called from exactly one spot --
clear_bdi_congested().  That is only called from:

drivers/block/pktcdvd.c
fs/ceph/addr.c
fs/fuse/control.c
fs/fuse/dev.c
fs/nfs/write.c

Jens, is something supposed to be calling clear_bdi_congested() in the
block layer?  blk_clear_congested() used to exist until October 29th
last year.  Or is something else supposed to be waking up tasks that
are sleeping on congestion?
Lin Feng Sept. 19, 2019, 7:46 a.m. UTC | #8
On 9/19/19 11:49, Matthew Wilcox wrote:
> On Thu, Sep 19, 2019 at 10:33:10AM +0800, Lin Feng wrote:
>> On 9/18/19 20:33, Michal Hocko wrote:
>>> I absolutely agree here. From you changelog it is also not clear what is
>>> the underlying problem. Both congestion_wait and wait_iff_congested
>>> should wake up early if the congestion is handled. Is this not the case?
>>
>> For now I don't know why, codes seem should work as you said, maybe I need to
>> trace more of the internals.
>> But weird thing is that once I set the people-disliked-tunable iowait
>> drop down instantly, this is contradictory to the code design.
> 
> Yes, this is quite strange.  If setting a smaller timeout makes a
> difference, that indicates we're not waking up soon enough.  I see
> two possibilities; one is that a wakeup is missing somewhere -- ie the
> conditions under which we call clear_wb_congested() are wrong.  Or we
> need to wake up sooner.
> 
> Umm.  We have clear_wb_congested() called from exactly one spot --
> clear_bdi_congested().  That is only called from:
> 
> drivers/block/pktcdvd.c
> fs/ceph/addr.c
> fs/fuse/control.c
> fs/fuse/dev.c
> fs/nfs/write.c
> 
> Jens, is something supposed to be calling clear_bdi_congested() in the
> block layer?  blk_clear_congested() used to exist until October 29th
> last year.  Or is something else supposed to be waking up tasks that
> are sleeping on congestion?
> 

IIUC, it looks like that after commit a1ce35fa49852db60fc6e268038530be533c5b15,
apart from those *.c places you mentioned above, the vmscan code will always
wait the full 100ms and nobody wakes it up.

here:
1964         while (unlikely(too_many_isolated(pgdat, file, sc))) {
1965                 if (stalled)
1966                         return 0;
1967
1968                 /* wait a bit for the reclaimer. */
 >1969                 msleep(100);
1970                 stalled = true;
1971
1972                 /* We are about to die and free our memory. Return now. */
1973                 if (fatal_signal_pending(current))
1974                         return SWAP_CLUSTER_MAX;
1975         }

and here:
2784                         /*
2785                          * If kswapd scans pages marked marked for immediate
2786                          * reclaim and under writeback (nr_immediate), it
2787                          * implies that pages are cycling through the LRU
2788                          * faster than they are written so also forcibly stall.
2789                          */
2790                         if (sc->nr.immediate)
 >2791                                 congestion_wait(BLK_RW_ASYNC, HZ/10);
2792                 }

except here, where the code that calls set_bdi_congested also calls
clear_bdi_congested at the proper time (exactly the source files you mentioned
above), so this one is OK:
2808                 if (!sc->hibernation_mode && !current_is_kswapd() &&
2809                    current_may_throttle() && pgdat_memcg_congested(pgdat, root))
2810                         wait_iff_congested(BLK_RW_ASYNC, HZ/10);
Michal Hocko Sept. 19, 2019, 8:22 a.m. UTC | #9
On Thu 19-09-19 15:46:11, Lin Feng wrote:
> 
> 
> On 9/19/19 11:49, Matthew Wilcox wrote:
> > On Thu, Sep 19, 2019 at 10:33:10AM +0800, Lin Feng wrote:
> > > On 9/18/19 20:33, Michal Hocko wrote:
> > > > I absolutely agree here. From you changelog it is also not clear what is
> > > > the underlying problem. Both congestion_wait and wait_iff_congested
> > > > should wake up early if the congestion is handled. Is this not the case?
> > > 
> > > For now I don't know why, codes seem should work as you said, maybe I need to
> > > trace more of the internals.
> > > But weird thing is that once I set the people-disliked-tunable iowait
> > > drop down instantly, this is contradictory to the code design.
> > 
> > Yes, this is quite strange.  If setting a smaller timeout makes a
> > difference, that indicates we're not waking up soon enough.  I see
> > two possibilities; one is that a wakeup is missing somewhere -- ie the
> > conditions under which we call clear_wb_congested() are wrong.  Or we
> > need to wake up sooner.
> > 
> > Umm.  We have clear_wb_congested() called from exactly one spot --
> > clear_bdi_congested().  That is only called from:
> > 
> > drivers/block/pktcdvd.c
> > fs/ceph/addr.c
> > fs/fuse/control.c
> > fs/fuse/dev.c
> > fs/nfs/write.c
> > 
> > Jens, is something supposed to be calling clear_bdi_congested() in the
> > block layer?  blk_clear_congested() used to exist until October 29th
> > last year.  Or is something else supposed to be waking up tasks that
> > are sleeping on congestion?
> > 
> 
> IIUC it looks like after commit a1ce35fa49852db60fc6e268038530be533c5b15,

This is something for Jens to comment on. Not waking up on congestion
indeed sounds like a bug.

> besides those *.c places as you mentioned above, vmscan codes will always
> wait as long as 100ms and nobody wakes them up.

Yes, this is true, but you should realize that this path is triggered only
under heavy memory reclaim cases where there is nothing to reclaim
because there are too many pages already isolated and we are waiting for
reclaimers to make some progress on them. It is also possible that there
are simply no reclaimable pages at all and we are heading towards an OOM
situation. In both cases waiting a bit shouldn't be critical because
this is really a cold path. It would be much better to have a mechanism
to wake up earlier, but this is likely to be non-trivial and I am not
sure it is worth the effort considering how rare this should be.
Matthew Wilcox Sept. 23, 2019, 11:19 a.m. UTC | #10
Ping Jens?

On Wed, Sep 18, 2019 at 08:49:49PM -0700, Matthew Wilcox wrote:
> On Thu, Sep 19, 2019 at 10:33:10AM +0800, Lin Feng wrote:
> > On 9/18/19 20:33, Michal Hocko wrote:
> > > I absolutely agree here. From you changelog it is also not clear what is
> > > the underlying problem. Both congestion_wait and wait_iff_congested
> > > should wake up early if the congestion is handled. Is this not the case?
> > 
> > For now I don't know why, codes seem should work as you said, maybe I need to
> > trace more of the internals.
> > But weird thing is that once I set the people-disliked-tunable iowait
> > drop down instantly, this is contradictory to the code design.
> 
> Yes, this is quite strange.  If setting a smaller timeout makes a
> difference, that indicates we're not waking up soon enough.  I see
> two possibilities; one is that a wakeup is missing somewhere -- ie the
> conditions under which we call clear_wb_congested() are wrong.  Or we
> need to wake up sooner.
> 
> Umm.  We have clear_wb_congested() called from exactly one spot --
> clear_bdi_congested().  That is only called from:
> 
> drivers/block/pktcdvd.c
> fs/ceph/addr.c
> fs/fuse/control.c
> fs/fuse/dev.c
> fs/nfs/write.c
> 
> Jens, is something supposed to be calling clear_bdi_congested() in the
> block layer?  blk_clear_congested() used to exist until October 29th
> last year.  Or is something else supposed to be waking up tasks that
> are sleeping on congestion?
> 
>
Jens Axboe Sept. 23, 2019, 7:38 p.m. UTC | #11
On 9/23/19 5:19 AM, Matthew Wilcox wrote:
> 
> Ping Jens?
> 
> On Wed, Sep 18, 2019 at 08:49:49PM -0700, Matthew Wilcox wrote:
>> On Thu, Sep 19, 2019 at 10:33:10AM +0800, Lin Feng wrote:
>>> On 9/18/19 20:33, Michal Hocko wrote:
>>>> I absolutely agree here. From you changelog it is also not clear what is
>>>> the underlying problem. Both congestion_wait and wait_iff_congested
>>>> should wake up early if the congestion is handled. Is this not the case?
>>>
>>> For now I don't know why, codes seem should work as you said, maybe I need to
>>> trace more of the internals.
>>> But weird thing is that once I set the people-disliked-tunable iowait
>>> drop down instantly, this is contradictory to the code design.
>>
>> Yes, this is quite strange.  If setting a smaller timeout makes a
>> difference, that indicates we're not waking up soon enough.  I see
>> two possibilities; one is that a wakeup is missing somewhere -- ie the
>> conditions under which we call clear_wb_congested() are wrong.  Or we
>> need to wake up sooner.
>>
>> Umm.  We have clear_wb_congested() called from exactly one spot --
>> clear_bdi_congested().  That is only called from:
>>
>> drivers/block/pktcdvd.c
>> fs/ceph/addr.c
>> fs/fuse/control.c
>> fs/fuse/dev.c
>> fs/nfs/write.c
>>
>> Jens, is something supposed to be calling clear_bdi_congested() in the
>> block layer?  blk_clear_congested() used to exist until October 29th
>> last year.  Or is something else supposed to be waking up tasks that
>> are sleeping on congestion?

Congestion isn't there anymore. It was always broken as a concept imho,
since it was inherently racy. We used the old batching mechanism in the
legacy stack to signal it, and it only worked for some devices.
Matthew Wilcox Sept. 23, 2019, 7:45 p.m. UTC | #12
On Mon, Sep 23, 2019 at 01:38:23PM -0600, Jens Axboe wrote:
> On 9/23/19 5:19 AM, Matthew Wilcox wrote:
> > 
> > Ping Jens?
> > 
> > On Wed, Sep 18, 2019 at 08:49:49PM -0700, Matthew Wilcox wrote:
> >> On Thu, Sep 19, 2019 at 10:33:10AM +0800, Lin Feng wrote:
> >>> On 9/18/19 20:33, Michal Hocko wrote:
> >>>> I absolutely agree here. From you changelog it is also not clear what is
> >>>> the underlying problem. Both congestion_wait and wait_iff_congested
> >>>> should wake up early if the congestion is handled. Is this not the case?
> >>>
> >>> For now I don't know why, codes seem should work as you said, maybe I need to
> >>> trace more of the internals.
> >>> But weird thing is that once I set the people-disliked-tunable iowait
> >>> drop down instantly, this is contradictory to the code design.
> >>
> >> Yes, this is quite strange.  If setting a smaller timeout makes a
> >> difference, that indicates we're not waking up soon enough.  I see
> >> two possibilities; one is that a wakeup is missing somewhere -- ie the
> >> conditions under which we call clear_wb_congested() are wrong.  Or we
> >> need to wake up sooner.
> >>
> >> Umm.  We have clear_wb_congested() called from exactly one spot --
> >> clear_bdi_congested().  That is only called from:
> >>
> >> drivers/block/pktcdvd.c
> >> fs/ceph/addr.c
> >> fs/fuse/control.c
> >> fs/fuse/dev.c
> >> fs/nfs/write.c
> >>
> >> Jens, is something supposed to be calling clear_bdi_congested() in the
> >> block layer?  blk_clear_congested() used to exist until October 29th
> >> last year.  Or is something else supposed to be waking up tasks that
> >> are sleeping on congestion?
> 
> Congestion isn't there anymore. It was always broken as a concept imho,
> since it was inherently racy. We used the old batching mechanism in the
> legacy stack to signal it, and it only worked for some devices.

Umm.  OK.  Well, something that used to work is now broken.  So how
should we fix it?  Take a look at shrink_node() in mm/vmscan.c.  If we've
submitted a lot of writes to a device, and overloaded it, we want to
sleep until it's able to take more writes:

                /*
                 * Stall direct reclaim for IO completions if underlying BDIs
                 * and node is congested. Allow kswapd to continue until it
                 * starts encountering unqueued dirty pages or cycling through
                 * the LRU too quickly.
                 */
                if (!sc->hibernation_mode && !current_is_kswapd() &&
                   current_may_throttle() && pgdat_memcg_congested(pgdat, root))
                        wait_iff_congested(BLK_RW_ASYNC, HZ/10);

With a standard block device, that now sleeps until the timeout (100ms)
expires, which is far too long for a modern SSD but is probably tuned
just right for some legacy piece of spinning rust (or indeed a modern
USB stick).  How would the block layer like to indicate to the mm layer
"I am too busy, please let the device work for a bit"?
Jens Axboe Sept. 23, 2019, 7:51 p.m. UTC | #13
On 9/23/19 1:45 PM, Matthew Wilcox wrote:
> On Mon, Sep 23, 2019 at 01:38:23PM -0600, Jens Axboe wrote:
>> On 9/23/19 5:19 AM, Matthew Wilcox wrote:
>>>
>>> Ping Jens?
>>>
>>> On Wed, Sep 18, 2019 at 08:49:49PM -0700, Matthew Wilcox wrote:
>>>> On Thu, Sep 19, 2019 at 10:33:10AM +0800, Lin Feng wrote:
>>>>> On 9/18/19 20:33, Michal Hocko wrote:
>>>>>> I absolutely agree here. From you changelog it is also not clear what is
>>>>>> the underlying problem. Both congestion_wait and wait_iff_congested
>>>>>> should wake up early if the congestion is handled. Is this not the case?
>>>>>
>>>>> For now I don't know why, codes seem should work as you said, maybe I need to
>>>>> trace more of the internals.
>>>>> But weird thing is that once I set the people-disliked-tunable iowait
>>>>> drop down instantly, this is contradictory to the code design.
>>>>
>>>> Yes, this is quite strange.  If setting a smaller timeout makes a
>>>> difference, that indicates we're not waking up soon enough.  I see
>>>> two possibilities; one is that a wakeup is missing somewhere -- ie the
>>>> conditions under which we call clear_wb_congested() are wrong.  Or we
>>>> need to wake up sooner.
>>>>
>>>> Umm.  We have clear_wb_congested() called from exactly one spot --
>>>> clear_bdi_congested().  That is only called from:
>>>>
>>>> drivers/block/pktcdvd.c
>>>> fs/ceph/addr.c
>>>> fs/fuse/control.c
>>>> fs/fuse/dev.c
>>>> fs/nfs/write.c
>>>>
>>>> Jens, is something supposed to be calling clear_bdi_congested() in the
>>>> block layer?  blk_clear_congested() used to exist until October 29th
>>>> last year.  Or is something else supposed to be waking up tasks that
>>>> are sleeping on congestion?
>>
>> Congestion isn't there anymore. It was always broken as a concept imho,
>> since it was inherently racy. We used the old batching mechanism in the
>> legacy stack to signal it, and it only worked for some devices.
> 
> Umm.  OK.  Well, something that used to work is now broken.  So how

It didn't really...

> should we fix it?  Take a look at shrink_node() in mm/vmscan.c.  If we've
> submitted a lot of writes to a device, and overloaded it, we want to
> sleep until it's able to take more writes:
> 
>                  /*
>                   * Stall direct reclaim for IO completions if underlying BDIs
>                   * and node is congested. Allow kswapd to continue until it
>                   * starts encountering unqueued dirty pages or cycling through
>                   * the LRU too quickly.
>                   */
>                  if (!sc->hibernation_mode && !current_is_kswapd() &&
>                     current_may_throttle() && pgdat_memcg_congested(pgdat, root))
>                          wait_iff_congested(BLK_RW_ASYNC, HZ/10);
> 
> With a standard block device, that now sleeps until the timeout (100ms)
> expires, which is far too long for a modern SSD but is probably tuned
> just right for some legacy piece of spinning rust (or indeed a modern
> USB stick).  How would the block layer like to indicate to the mm layer
> "I am too busy, please let the device work for a bit"?

Maybe base the sleep on the bdi write speed? We can't feasibly tell you
if something is congested. It used to sort of work on things like sata
drives, since we'd get congested when we hit the queue limit and that
wasn't THAT far off with reality. Didn't work on SCSI with higher queue
depths, and certainly doesn't work on NVMe where most devices have very
deep queues.

Or we can have something that does "sleep until X requests/MB have been
flushed", something that the vm would actively call. Combined with a
timeout as well, probably.

For the vm case above, it's further complicated by it being global
state. I think you'd be better off just making the delay smaller.  100ms
is an eternity, and 10ms wakeups aren't going to cause any major issues
in terms of CPU usage. If we're calling the above wait_iff_congested(),
it had better be because we're otherwise SOL.
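
A standalone model of the "sleep until X pages have been flushed, with a
timeout" waiting discipline described above; the completion counter, the
helper names and the fake writeback thread are stand-ins for block-layer
state that is not defined anywhere in this thread (build with -pthread).

#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static unsigned long pages_written;	/* pages whose writeback completed */

/* would be called by the IO completion side when pages finish writeback */
static void writeback_done(unsigned long pages)
{
	pthread_mutex_lock(&lock);
	pages_written += pages;
	pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
}

/* reclaim side: sleep until 'want' more pages complete, or until timeout_ms */
static int wait_for_writeback(unsigned long want, long timeout_ms)
{
	struct timespec deadline;
	unsigned long target;
	int ret = 0;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += timeout_ms / 1000;
	deadline.tv_nsec += (timeout_ms % 1000) * 1000000L;
	if (deadline.tv_nsec >= 1000000000L) {
		deadline.tv_sec++;
		deadline.tv_nsec -= 1000000000L;
	}

	pthread_mutex_lock(&lock);
	target = pages_written + want;
	while (pages_written < target && ret == 0)
		ret = pthread_cond_timedwait(&cond, &lock, &deadline);
	pthread_mutex_unlock(&lock);
	return ret;	/* 0: enough progress was made, ETIMEDOUT: gave up */
}

static void *fake_writeback(void *arg)
{
	struct timespec io = { 0, 5 * 1000000L };	/* pretend IO takes ~5ms */

	(void)arg;
	nanosleep(&io, NULL);
	writeback_done(256);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, fake_writeback, NULL);
	/* wake as soon as 256 pages complete, but never sleep longer than 100ms */
	printf("wait_for_writeback returned %d\n", wait_for_writeback(256, 100));
	pthread_join(t, NULL);
	return 0;
}
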
Michal Hocko Sept. 24, 2019, 12:16 p.m. UTC | #14
On Mon 23-09-19 13:51:21, Jens Axboe wrote:
> On 9/23/19 1:45 PM, Matthew Wilcox wrote:
> > On Mon, Sep 23, 2019 at 01:38:23PM -0600, Jens Axboe wrote:
> >> On 9/23/19 5:19 AM, Matthew Wilcox wrote:
> >>>
> >>> Ping Jens?
> >>>
> >>> On Wed, Sep 18, 2019 at 08:49:49PM -0700, Matthew Wilcox wrote:
> >>>> On Thu, Sep 19, 2019 at 10:33:10AM +0800, Lin Feng wrote:
> >>>>> On 9/18/19 20:33, Michal Hocko wrote:
> >>>>>> I absolutely agree here. From you changelog it is also not clear what is
> >>>>>> the underlying problem. Both congestion_wait and wait_iff_congested
> >>>>>> should wake up early if the congestion is handled. Is this not the case?
> >>>>>
> >>>>> For now I don't know why, codes seem should work as you said, maybe I need to
> >>>>> trace more of the internals.
> >>>>> But weird thing is that once I set the people-disliked-tunable iowait
> >>>>> drop down instantly, this is contradictory to the code design.
> >>>>
> >>>> Yes, this is quite strange.  If setting a smaller timeout makes a
> >>>> difference, that indicates we're not waking up soon enough.  I see
> >>>> two possibilities; one is that a wakeup is missing somewhere -- ie the
> >>>> conditions under which we call clear_wb_congested() are wrong.  Or we
> >>>> need to wake up sooner.
> >>>>
> >>>> Umm.  We have clear_wb_congested() called from exactly one spot --
> >>>> clear_bdi_congested().  That is only called from:
> >>>>
> >>>> drivers/block/pktcdvd.c
> >>>> fs/ceph/addr.c
> >>>> fs/fuse/control.c
> >>>> fs/fuse/dev.c
> >>>> fs/nfs/write.c
> >>>>
> >>>> Jens, is something supposed to be calling clear_bdi_congested() in the
> >>>> block layer?  blk_clear_congested() used to exist until October 29th
> >>>> last year.  Or is something else supposed to be waking up tasks that
> >>>> are sleeping on congestion?
> >>
> >> Congestion isn't there anymore. It was always broken as a concept imho,
> >> since it was inherently racy. We used the old batching mechanism in the
> >> legacy stack to signal it, and it only worked for some devices.
> > 
> > Umm.  OK.  Well, something that used to work is now broken.  So how
> 
> It didn't really...

Maybe it would have been better to tell users who are using interfaces
that rely on this.
 
> > should we fix it?  Take a look at shrink_node() in mm/vmscan.c.  If we've
> > submitted a lot of writes to a device, and overloaded it, we want to
> > sleep until it's able to take more writes:
> > 
> >                  /*
> >                   * Stall direct reclaim for IO completions if underlying BDIs
> >                   * and node is congested. Allow kswapd to continue until it
> >                   * starts encountering unqueued dirty pages or cycling through
> >                   * the LRU too quickly.
> >                   */
> >                  if (!sc->hibernation_mode && !current_is_kswapd() &&
> >                     current_may_throttle() && pgdat_memcg_congested(pgdat, root))
> >                          wait_iff_congested(BLK_RW_ASYNC, HZ/10);
> > 
> > With a standard block device, that now sleeps until the timeout (100ms)
> > expires, which is far too long for a modern SSD but is probably tuned
> > just right for some legacy piece of spinning rust (or indeed a modern
> > USB stick).  How would the block layer like to indicate to the mm layer
> > "I am too busy, please let the device work for a bit"?
> 
> Maybe base the sleep on the bdi write speed? We can't feasibly tell you
> if something is congested. It used to sort of work on things like sata
> drives, since we'd get congested when we hit the queue limit and that
> wasn't THAT far off with reality. Didn't work on SCSI with higher queue
> depths, and certainly doesn't work on NVMe where most devices have very
> deep queues.
> 
> Or we can have something that does "sleep until X requests/MB have been
> flushed", something that the vm would actively call. Combined with a
> timeout as well, probably.
> 
> For the vm case above, it's further complicated by it being global
> state. I think you'd be better off just making the delay smaller.  100ms
> is an eternity, and 10ms wakeups isn't going to cause any major issues
> in terms of CPU usage. If we're calling the above wait_iff_congested(),
> it better because we're otherwise SOL.

I do not mind using shorter sleeps. A downside would be that very slow
storage could hit OOM sooner because we do not wait long enough on the one
hand, or that we reintroduce the direct reclaim stalls fixed by 0e093d99763eb
on the other.

I do agree that our poor man's congestion feedback mechanism is far from
great. It would be really nice to have something resembling real feedback
though. The global aspect of the thing is surely not helping much, but
reclaim encounters pages from different bdis, and accounting for each of
them sounds like overkill and too elaborate to implement. Would it be
possible to have something like "throttle on at least one too busy bdi"?

Patch

diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 64aeee1009ca..e4dd83731ecf 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -837,6 +837,23 @@  than the high water mark in a zone.
 The default value is 60.
 
 
+mm_reclaim_congestion_wait_jiffies
+==================================
+
+This control defines how long the kernel will wait/sleep while system
+memory is under pressure and memory reclaim is relatively active.
+Lower values will decrease the kernel wait/sleep time.
+
+It is suggested to lower this value on a high-end box where the system is
+under memory pressure but with low storage IO utilization and high CPU
+iowait, which could also potentially decrease user application response time.
+
+Leave this control unchanged if your box does not match the above case.
+
+The default value is HZ/10, which equals 100ms regardless of how
+many HZ is defined.
+
+
 unprivileged_userfaultfd
 ========================
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 078950d9605b..064a3da04789 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -114,6 +114,7 @@  extern int pid_max;
 extern int pid_max_min, pid_max_max;
 extern int percpu_pagelist_fraction;
 extern int latencytop_enabled;
+extern int mm_reclaim_congestion_wait_jiffies;
 extern unsigned int sysctl_nr_open_min, sysctl_nr_open_max;
 #ifndef CONFIG_MMU
 extern int sysctl_nr_trim_pages;
@@ -1413,6 +1414,15 @@  static struct ctl_table vm_table[] = {
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= &one_hundred,
 	},
+	{
+		.procname	= "mm_reclaim_congestion_wait_jiffies",
+		.data		= &mm_reclaim_congestion_wait_jiffies,
+		.maxlen		= sizeof(mm_reclaim_congestion_wait_jiffies),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ONE,
+		.extra2		= &one_hundred,
+	},
 #ifdef CONFIG_HUGETLB_PAGE
 	{
 		.procname	= "nr_hugepages",
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a6c5d0b28321..8c19afdcff95 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -165,6 +165,12 @@  struct scan_control {
  * From 0 .. 100.  Higher means more swappy.
  */
 int vm_swappiness = 60;
+
+/*
+ * From 0 .. 100.  Lower means shorter memory reclaim IO congestion wait time.
+ */
+int mm_reclaim_congestion_wait_jiffies = HZ / 10;
+
 /*
  * The total number of pages which are beyond the high watermark within all
  * zones.
@@ -1966,7 +1972,7 @@  shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 			return 0;
 
 		/* wait a bit for the reclaimer. */
-		msleep(100);
+		msleep(jiffies_to_msecs(mm_reclaim_congestion_wait_jiffies));
 		stalled = true;
 
 		/* We are about to die and free our memory. Return now. */
@@ -2788,7 +2794,7 @@  static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 			 * faster than they are written so also forcibly stall.
 			 */
 			if (sc->nr.immediate)
-				congestion_wait(BLK_RW_ASYNC, HZ/10);
+				congestion_wait(BLK_RW_ASYNC, mm_reclaim_congestion_wait_jiffies);
 		}
 
 		/*
@@ -2807,7 +2813,7 @@  static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 		 */
 		if (!sc->hibernation_mode && !current_is_kswapd() &&
 		   current_may_throttle() && pgdat_memcg_congested(pgdat, root))
-			wait_iff_congested(BLK_RW_ASYNC, HZ/10);
+			wait_iff_congested(BLK_RW_ASYNC, mm_reclaim_congestion_wait_jiffies);
 
 	} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
 					 sc->nr_scanned - nr_scanned, sc));