
squashfs: enable __GFP_FS in ->readpage to prevent hang in mem alloc

Message ID 20181204020840.49576-1-houtao1@huawei.com (mailing list archive)
State New, archived
Series squashfs: enable __GFP_FS in ->readpage to prevent hang in mem alloc

Commit Message

Hou Tao Dec. 4, 2018, 2:08 a.m. UTC
There is no need to disable __GFP_FS in ->readpage:
* It's a read-only fs, so there will be no dirty/writeback page and
  there will be no deadlock against the caller's locked page
* It just allocates one page, so compaction will not be invoked
* It doesn't take any inode lock, so the reclamation of inode will be fine

And the absence of __GFP_FS may lead to a hang in __alloc_pages_slowpath() if a
squashfs page fault occurs in the context of a memory hogger, because
the hogger will not be killed due to the logic in __alloc_pages_may_oom().

Signed-off-by: Hou Tao <houtao1@huawei.com>
---
 fs/squashfs/file.c          |  3 ++-
 fs/squashfs/file_direct.c   |  4 +++-
 fs/squashfs/squashfs_fs_f.h | 25 +++++++++++++++++++++++++
 3 files changed, 30 insertions(+), 2 deletions(-)
 create mode 100644 fs/squashfs/squashfs_fs_f.h
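
For context: grab_cache_page_nowait(), the helper being replaced, looked like
this in include/linux/pagemap.h around v4.20 (quoted from memory, so it may
differ in detail from the exact tree); the squashfs_grab_cache_page_nowait()
added by the patch is the same call minus FGP_NOFS:

	static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
					pgoff_t index)
	{
		return pagecache_get_page(mapping, index,
				FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
				mapping_gfp_mask(mapping));
	}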

Comments

Hou Tao Dec. 6, 2018, 1:14 a.m. UTC | #1
ping ?

On 2018/12/4 10:08, Hou Tao wrote:
> There is no need to disable __GFP_FS in ->readpage:
> [...]
Hou Tao Dec. 13, 2018, 2:18 a.m. UTC | #2
ping ?

On 2018/12/6 9:14, Hou Tao wrote:
> ping ?
> 
> On 2018/12/4 10:08, Hou Tao wrote:
>> There is no need to disable __GFP_FS in ->readpage:
>> [...]
Hou Tao Dec. 15, 2018, 1:24 p.m. UTC | #3
ping ?

On 2018/12/13 10:18, Hou Tao wrote:
> ping ?
> 
> On 2018/12/6 9:14, Hou Tao wrote:
>> ping ?
>>
>> On 2018/12/4 10:08, Hou Tao wrote:
>>> There is no need to disable __GFP_FS in ->readpage:
>>> [...]
Matthew Wilcox Dec. 15, 2018, 2:38 p.m. UTC | #4
On Tue, Dec 04, 2018 at 10:08:40AM +0800, Hou Tao wrote:
> There is no need to disable __GFP_FS in ->readpage:
> * It's a read-only fs, so there will be no dirty/writeback page and
>   there will be no deadlock against the caller's locked page
> * It just allocates one page, so compaction will not be invoked
> * It doesn't take any inode lock, so the reclamation of inode will be fine
> 
> And the absence of __GFP_FS may lead to a hang in __alloc_pages_slowpath() if a
> squashfs page fault occurs in the context of a memory hogger, because
> the hogger will not be killed due to the logic in __alloc_pages_may_oom().

I don't understand your argument here.  There's a comment in
__alloc_pages_may_oom() saying that we _should_ treat GFP_NOFS
specially, but we currently don't.

        /*
         * XXX: GFP_NOFS allocations should rather fail than rely on
         * other request to make a forward progress.
         * We are in an unfortunate situation where out_of_memory cannot
         * do much for this context but let's try it to at least get
         * access to memory reserved if the current task is killed (see
         * out_of_memory). Once filesystems are ready to handle allocation
         * failures more gracefully we should just bail out here.
         */

What problem are you actually seeing?
Hou Tao Dec. 16, 2018, 9:38 a.m. UTC | #5
Hi,

On 2018/12/15 22:38, Matthew Wilcox wrote:
> On Tue, Dec 04, 2018 at 10:08:40AM +0800, Hou Tao wrote:
>> There is no need to disable __GFP_FS in ->readpage:
>> * It's a read-only fs, so there will be no dirty/writeback page and
>>   there will be no deadlock against the caller's locked page
>> * It just allocates one page, so compaction will not be invoked
>> * It doesn't take any inode lock, so the reclamation of inode will be fine
>>
>> And the absence of __GFP_FS may lead to a hang in __alloc_pages_slowpath() if a
>> squashfs page fault occurs in the context of a memory hogger, because
>> the hogger will not be killed due to the logic in __alloc_pages_may_oom().
> 
> I don't understand your argument here.  There's a comment in
> __alloc_pages_may_oom() saying that we _should_ treat GFP_NOFS
> specially, but we currently don't.
I am trying to say that if __GFP_FS is used in pagecache_get_page() when it tries
to allocate a new page for squashfs, there will be no possibility of deadlock for
squashfs.

We do treat GFP_NOFS specially in out_of_memory():

    /*
     * The OOM killer does not compensate for IO-less reclaim.
     * pagefault_out_of_memory lost its gfp context so we have to
     * make sure exclude 0 mask - all other users should have at least
     * ___GFP_DIRECT_RECLAIM to get here.
     */
    if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS))
        return true;

So if __GFP_FS is not set, no task will be killed because we will return from
out_of_memory() prematurely. And that will lead to an infinite loop in
__alloc_pages_slowpath(), as we have observed:

* a squashfs page fault occurred in the context of a memory hogger
* the page used for the page fault was allocated successfully
* in squashfs_readpage() squashfs will try to allocate the other pages in
  the same 128KB block, with __GFP_FS cleared (the mask is actually
  GFP_HIGHUSER_MOVABLE & ~__GFP_FS)
* in __alloc_pages_slowpath() we can not get any pages through reclamation
  (because most of the memory is used by the current task) and we also can
  not kill the current task (because __GFP_FS is cleared), so it will loop
  forever
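
Put together with the check quoted above, the loop arises roughly like this
(a simplified paraphrase of the v4.20-era mm/page_alloc.c, not verbatim
kernel code):

	/* __alloc_pages_may_oom(): out_of_memory() returning true counts
	 * as progress, even when it bailed out for !__GFP_FS without
	 * killing anything */
	if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
		*did_some_progress = 1;

	/* back in __alloc_pages_slowpath(): progress means retry */
	if (did_some_progress) {
		no_progress_loops = 0;
		goto retry;
	}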

> 
>         /*
>          * XXX: GFP_NOFS allocations should rather fail than rely on
>          * other request to make a forward progress.
>          * We are in an unfortunate situation where out_of_memory cannot
>          * do much for this context but let's try it to at least get
>          * access to memory reserved if the current task is killed (see
>          * out_of_memory). Once filesystems are ready to handle allocation
>          * failures more gracefully we should just bail out here.
>          */
> 
> What problem are you actually seeing?
Matthew Wilcox Dec. 17, 2018, 3:51 a.m. UTC | #6
On Sun, Dec 16, 2018 at 05:38:13PM +0800, Hou Tao wrote:
> [...]
> 
> So if __GFP_FS is not set, no task will be killed because we will return from
> out_of_memory() prematurely. And that will lead to an infinite loop in
> __alloc_pages_slowpath(), as we have observed:
> 
> * a squashfs page fault occurred in the context of a memory hogger
> * the page used for the page fault was allocated successfully
> * in squashfs_readpage() squashfs will try to allocate the other pages in
>   the same 128KB block, with __GFP_FS cleared (the mask is actually
>   GFP_HIGHUSER_MOVABLE & ~__GFP_FS)
> * in __alloc_pages_slowpath() we can not get any pages through reclamation
>   (because most of the memory is used by the current task) and we also can
>   not kill the current task (because __GFP_FS is cleared), so it will loop
>   forever

Ah, yes, that makes perfect sense.  Thank you for the explanation.

I wonder if the correct fix, however, is not to move the check for
GFP_NOFS in out_of_memory() down to below the check whether to kill
the current task.  That would solve your problem, and I don't _think_
it would cause any new ones.  Michal, you touched this code last, what
do you think?
Michal Hocko Dec. 17, 2018, 9:33 a.m. UTC | #7
On Sun 16-12-18 19:51:57, Matthew Wilcox wrote:
[...]
> Ah, yes, that makes perfect sense.  Thank you for the explanation.
> 
> I wonder if the correct fix, however, is not to move the check for
> GFP_NOFS in out_of_memory() down to below the check whether to kill
> the current task.  That would solve your problem, and I don't _think_
> it would cause any new ones.  Michal, you touched this code last, what
> do you think?

What do you mean exactly? Whether we kill a current task or something
else doesn't change much on the fact that NOFS is a reclaim restricted
context and we might kill too early. If the fs can do GFP_FS then it is
obviously a better thing to do because FS metadata can be reclaimed as
well and therefore there is potentially less memory pressure on
application data.
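
For reference, the mask relationships behind this, from include/linux/gfp.h
around v4.20 (paraphrased from memory; unrelated flags omitted):

	#define GFP_KERNEL		(__GFP_RECLAIM | __GFP_IO | __GFP_FS)
	#define GFP_NOFS		(__GFP_RECLAIM | __GFP_IO)
	#define GFP_USER		(__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
	#define GFP_HIGHUSER		(GFP_USER | __GFP_HIGHMEM)
	#define GFP_HIGHUSER_MOVABLE	(GFP_HIGHUSER | __GFP_MOVABLE)

So GFP_HIGHUSER_MOVABLE & ~__GFP_FS, the mask squashfs ends up with, still
carries __GFP_DIRECT_RECLAIM: the task can enter the slowpath and reclaim,
it just cannot reclaim through the fs, and, per the check above, cannot be
OOM-killed either.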
Tetsuo Handa Dec. 17, 2018, 10:51 a.m. UTC | #8
On 2018/12/17 18:33, Michal Hocko wrote:
> On Sun 16-12-18 19:51:57, Matthew Wilcox wrote:
> [...]
>> Ah, yes, that makes perfect sense.  Thank you for the explanation.
>>
>> I wonder if the correct fix, however, is not to move the check for
>> GFP_NOFS in out_of_memory() down to below the check whether to kill
>> the current task.  That would solve your problem, and I don't _think_
>> it would cause any new ones.  Michal, you touched this code last, what
>> do you think?
> 
> What do you mean exactly? Whether we kill a current task or something
> else doesn't change much on the fact that NOFS is a reclaim restricted
> context and we might kill too early. If the fs can do GFP_FS then it is
> obviously a better thing to do because FS metadata can be reclaimed as
> well and therefore there is potentially less memory pressure on
> application data.
> 

I interpreted "to move the check for GFP_NOFS in out_of_memory() down to
below the check whether to kill the current task" as

@@ -1077,15 +1077,6 @@ bool out_of_memory(struct oom_control *oc)
 	}
 
 	/*
-	 * The OOM killer does not compensate for IO-less reclaim.
-	 * pagefault_out_of_memory lost its gfp context so we have to
-	 * make sure exclude 0 mask - all other users should have at least
-	 * ___GFP_DIRECT_RECLAIM to get here.
-	 */
-	if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS))
-		return true;
-
-	/*
 	 * Check if there were limitations on the allocation (only relevant for
 	 * NUMA and memcg) that may require different handling.
 	 */
@@ -1104,6 +1095,19 @@ bool out_of_memory(struct oom_control *oc)
 	}
 
 	select_bad_process(oc);
+
+	/*
+	 * The OOM killer does not compensate for IO-less reclaim.
+	 * pagefault_out_of_memory lost its gfp context so we have to
+	 * make sure exclude 0 mask - all other users should have at least
+	 * ___GFP_DIRECT_RECLAIM to get here.
+	 */
+	if ((oc->gfp_mask && !(oc->gfp_mask & __GFP_FS)) && oc->chosen &&
+	    oc->chosen != (void *)-1UL && oc->chosen != current) {
+		put_task_struct(oc->chosen);
+		return true;
+	}
+
 	/* Found nothing?!?! */
 	if (!oc->chosen) {
 		dump_header(oc, NULL);

which is prefixed by "the correct fix is not".

Behaving like sysctl_oom_kill_allocating_task == 1 if __GFP_FS is not used
will not be the correct fix. But ...

Hou Tao wrote:
> There is no need to disable __GFP_FS in ->readpage:
> * It's a read-only fs, so there will be no dirty/writeback page and
>   there will be no deadlock against the caller's locked page

is a read-only filesystem sufficient to make it safe to use __GFP_FS?

Isn't "whether it is safe to use __GFP_FS" depends on "whether fs locks
are held or not" rather than "whether fs has dirty/writeback page or not" ?
Matthew Wilcox Dec. 17, 2018, 12:25 p.m. UTC | #9
On Mon, Dec 17, 2018 at 07:51:27PM +0900, Tetsuo Handa wrote:
> [...]
> 
> I interpreted "to move the check for GFP_NOFS in out_of_memory() down to
> below the check whether to kill the current task" as

Too far; I meant one line earlier, before we try to select a different
process.

> [...]
> 
> Hou Tao wrote:
> > There is no need to disable __GFP_FS in ->readpage:
> > * It's a read-only fs, so there will be no dirty/writeback page and
> >   there will be no deadlock against the caller's locked page
> 
> is a read-only filesystem sufficient to make it safe to use __GFP_FS?
> 
> Isn't "whether it is safe to use __GFP_FS" depends on "whether fs locks
> are held or not" rather than "whether fs has dirty/writeback page or not" ?

It's worth noticing that squashfs _is_ in fact holding a page locked in
squashfs_copy_cache() when it calls grab_cache_page_nowait().  I'm not
sure if this will lead to trouble or not because I'm insufficiently
familiar with the reclaim path.
Michal Hocko Dec. 17, 2018, 2:10 p.m. UTC | #10
On Mon 17-12-18 04:25:46, Matthew Wilcox wrote:
> On Mon, Dec 17, 2018 at 07:51:27PM +0900, Tetsuo Handa wrote:
> > [...]
> > 
> > I interpreted "to move the check for GFP_NOFS in out_of_memory() down to
> > below the check whether to kill the current task" as
> 
> Too far; I meant one line earlier, before we try to select a different
> process.

We could still panic the system on a premature OOM. So it doesn't really
seem good.

> > [...]
> > 
> > Hou Tao wrote:
> > > There is no need to disable __GFP_FS in ->readpage:
> > > * It's a read-only fs, so there will be no dirty/writeback page and
> > >   there will be no deadlock against the caller's locked page
> > 
> > is a read-only filesystem sufficient to make it safe to use __GFP_FS?
> > 
> > Isn't "whether it is safe to use __GFP_FS" depends on "whether fs locks
> > are held or not" rather than "whether fs has dirty/writeback page or not" ?
> 
> It's worth noticing that squashfs _is_ in fact holding a page locked in
> squashfs_copy_cache() when it calls grab_cache_page_nowait().  I'm not
> sure if this will lead to trouble or not because I'm insufficiently
> familiar with the reclaim path.

Hmm, this is more interesting then. If there is any memcg-accounted
allocation down that path _and_ the squashfs writeout can lock more
pages and mark them writeback before they are really sent to the storage,
then we have a problem. See [1]

[1] http://lkml.kernel.org/r/20181213092221.27270-1-mhocko@kernel.org
Matthew Wilcox Dec. 17, 2018, 2:41 p.m. UTC | #11
On Mon, Dec 17, 2018 at 03:10:44PM +0100, Michal Hocko wrote:
> On Mon 17-12-18 04:25:46, Matthew Wilcox wrote:
> > It's worth noticing that squashfs _is_ in fact holding a page locked in
> > squashfs_copy_cache() when it calls grab_cache_page_nowait().  I'm not
> > sure if this will lead to trouble or not because I'm insufficiently
> > familiar with the reclaim path.
> 
> Hmm, this is more interesting then. If there is any memcg accounted
> allocation down that path _and_ the squashfs writeout can lock more
> pages and mark them writeback before they are really sent to the storage
> then we have a problem. See [1]
> 
> [1] http://lkml.kernel.org/r/20181213092221.27270-1-mhocko@kernel.org

Squashfs is read only, so it'll never have dirty pages and never do
writeout.

But ... maybe the GFP flags being used for grab_cache_page_nowait() are
wrong.  It does, after all, say "nowait".  Perhaps it shouldn't be trying
direct reclaim at all, but rather fail earlier.  Like this:

+++ b/mm/filemap.c
@@ -1550,6 +1550,8 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
                        gfp_mask |= __GFP_WRITE;
                if (fgp_flags & FGP_NOFS)
                        gfp_mask &= ~__GFP_FS;
+               if (fgp_flags & FGP_NOWAIT)
+                       gfp_mask &= ~__GFP_DIRECT_RECLAIM;
 
                page = __page_cache_alloc(gfp_mask);
                if (!page)
Michal Hocko Dec. 17, 2018, 2:49 p.m. UTC | #12
On Mon 17-12-18 06:41:01, Matthew Wilcox wrote:
> On Mon, Dec 17, 2018 at 03:10:44PM +0100, Michal Hocko wrote:
> > On Mon 17-12-18 04:25:46, Matthew Wilcox wrote:
> > > It's worth noticing that squashfs _is_ in fact holding a page locked in
> > > squashfs_copy_cache() when it calls grab_cache_page_nowait().  I'm not
> > > sure if this will lead to trouble or not because I'm insufficiently
> > > familiar with the reclaim path.
> > 
> > Hmm, this is more interesting then. If there is any memcg accounted
> > allocation down that path _and_ the squashfs writeout can lock more
> > pages and mark them writeback before they are really sent to the storage
> > then we have a problem. See [1]
> > 
> > [1] http://lkml.kernel.org/r/20181213092221.27270-1-mhocko@kernel.org
> 
> Squashfs is read only, so it'll never have dirty pages and never do
> writeout.
> 
> But ... maybe the GFP flags being used for grab_cache_page_nowait() are
> wrong.  It does, after all, say "nowait".  Perhaps it shouldn't be trying
> direct reclaim at all, but rather fail earlier.  Like this:
> 
> +++ b/mm/filemap.c
> @@ -1550,6 +1550,8 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
>                         gfp_mask |= __GFP_WRITE;
>                 if (fgp_flags & FGP_NOFS)
>                         gfp_mask &= ~__GFP_FS;
> +               if (fgp_flags & FGP_NOWAIT)
> +                       gfp_mask &= ~__GFP_DIRECT_RECLAIM;
>  
>                 page = __page_cache_alloc(gfp_mask);
>                 if (!page)

Isn't FGP_NOWAIT about the page lock rather than the allocation context?
Hou Tao Dec. 18, 2018, 6:06 a.m. UTC | #13
Hi,

On 2018/12/17 18:51, Tetsuo Handa wrote:
> [...]
> 
> Hou Tao wrote:
>> There is no need to disable __GFP_FS in ->readpage:
>> * It's a read-only fs, so there will be no dirty/writeback page and
>>   there will be no deadlock against the caller's locked page
> 
> is a read-only filesystem sufficient to make it safe to use __GFP_FS?
> 
> Isn't "whether it is safe to use __GFP_FS" depends on "whether fs locks
> are held or not" rather than "whether fs has dirty/writeback page or not" ?
> 
In my understanding (correct me if I am wrong), there are three ways through which
reclamation will invoke fs-related code and may cause deadlock:

(1) write-back of dirty pages. Not possible for squashfs.
(2) the reclamation of inodes & dentries. The current file is in use, so it will not be
    reclaimed, and for other reclaimable inodes, squashfs_destroy_inode() will
    be invoked and it doesn't take any locks.
(3) a customized shrinker defined by the fs. There is no customized shrinker in squashfs.

So my point is that even if a page lock is already held by squashfs_readpage() and
reclamation calls back into squashfs code, there will be no deadlock, so it is
safe to use __GFP_FS.
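
For path (2), the call chain in question is roughly the following (a
simplified sketch reconstructed from the v4.20-era fs/super.c and
fs/inode.c, not verbatim):

	shrink_slab()
	  super_cache_scan()		/* the per-superblock shrinker */
	    prune_icache_sb()
	      dispose_list()
	        evict()
	          destroy_inode()	/* squashfs: just frees the in-core
					   inode, taking no squashfs locks */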

Regards,
Tao

Michal Hocko Dec. 18, 2018, 11:32 a.m. UTC | #14
On Tue 18-12-18 14:06:11, Hou Tao wrote:
[...]
> In my understanding (correct me if I am wrong), there are three ways through which
> reclamation will invoke fs-related code and may cause deadlock:
> 
> (1) write-back of dirty pages. Not possible for squashfs.

Only from the kswapd context, so not relevant to the OOM killer.

> (2) the reclamation of inodes & dentries. The current file is in use, so it will not be
>     reclaimed, and for other reclaimable inodes, squashfs_destroy_inode() will
>     be invoked and it doesn't take any locks.

There are other inodes, not only those in use. Do you use any locks that
could be taken from an inode teardown?

Patch

diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index f1c1430ae721..8603dda4a719 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -51,6 +51,7 @@ 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
+#include "squashfs_fs_f.h"
 #include "squashfs.h"
 
 /*
@@ -414,7 +415,7 @@  void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer,
 		TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail);
 
 		push_page = (i == page->index) ? page :
-			grab_cache_page_nowait(page->mapping, i);
+			squashfs_grab_cache_page_nowait(page->mapping, i);
 
 		if (!push_page)
 			continue;
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index 80db1b86a27c..a0fdd6215348 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -17,6 +17,7 @@ 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
+#include "squashfs_fs_f.h"
 #include "squashfs.h"
 #include "page_actor.h"
 
@@ -60,7 +61,8 @@  int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
 	/* Try to grab all the pages covered by the Squashfs block */
 	for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) {
 		page[i] = (n == target_page->index) ? target_page :
-			grab_cache_page_nowait(target_page->mapping, n);
+			squashfs_grab_cache_page_nowait(
+					target_page->mapping, n);
 
 		if (page[i] == NULL) {
 			missing_pages++;
diff --git a/fs/squashfs/squashfs_fs_f.h b/fs/squashfs/squashfs_fs_f.h
new file mode 100644
index 000000000000..fc5fb7aeb27d
--- /dev/null
+++ b/fs/squashfs/squashfs_fs_f.h
@@ -0,0 +1,25 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef SQUASHFS_FS_F
+#define SQUASHFS_FS_F
+
+/*
+ * No need to use FGP_NOFS here:
+ * 1. It's a read-only fs, so there will be no dirty/writeback page and
+ *    there will be no deadlock against the caller's locked page.
+ * 2. It just allocates one page, so compaction will not be invoked.
+ * 3. It doesn't take any inode lock, so the reclamation of inode
+ *    will be fine.
+ *
+ * And GFP_NOFS may lead to an infinite loop in __alloc_pages_slowpath() if a
+ * squashfs page fault occurs in the context of a memory hogger, because
+ * the hogger will not be killed due to the logic in __alloc_pages_may_oom().
+ */
+static inline struct page *
+squashfs_grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
+{
+	return pagecache_get_page(mapping, index,
+			FGP_LOCK|FGP_CREAT|FGP_NOWAIT,
+			mapping_gfp_mask(mapping));
+}
+#endif
+
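
A user-space reproducer for the reported hang might look like the sketch
below. This is hypothetical and untested; the mount point
/mnt/squashfs/bigfile and the default 1024MB hog size are illustrations,
not taken from the original report, and the hog size has to be tuned close
to the machine's free memory for the squashfs fault to land in the OOM path:

	/* hog.c - hypothetical reproducer sketch, not from the original report.
	 * Eat most of RAM, then fault in pages of a file that lives on squashfs.
	 * With the bug, the faulting task spins in __alloc_pages_slowpath()
	 * instead of being OOM-killed. */
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/mman.h>
	#include <sys/stat.h>

	int main(int argc, char **argv)
	{
		const char *path = argc > 1 ? argv[1] : "/mnt/squashfs/bigfile";
		size_t hog = (argc > 2 ? (size_t)atol(argv[2]) : 1024) << 20;
		struct stat st;
		char *mem, *file;
		size_t i;
		volatile char sink = 0;
		int fd;

		/* Step 1: become the memory hogger; touch every page so the
		 * anonymous memory is actually resident. */
		mem = mmap(NULL, hog, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (mem == MAP_FAILED) {
			perror("mmap hog");
			return 1;
		}
		for (i = 0; i < hog; i += 4096)
			mem[i] = 1;

		/* Step 2: fault in a squashfs-backed mapping. Each fault makes
		 * squashfs_readpage() grab the sibling pages of the same 128KB
		 * block, with __GFP_FS cleared before this patch. */
		fd = open(path, O_RDONLY);
		if (fd < 0 || fstat(fd, &st) < 0) {
			perror(path);
			return 1;
		}
		file = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
		if (file == MAP_FAILED) {
			perror("mmap file");
			return 1;
		}
		for (i = 0; i < (size_t)st.st_size; i += 4096)
			sink += file[i];	/* expected: OOM kill;
						   with the bug: a hang */
		return sink;
	}

Built with e.g. `cc -o hog hog.c`; the expectation is that the final loop
either gets the task OOM-killed (fixed kernel) or wedges in the allocator
(buggy kernel).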