diff mbox series

Add prctl support for controlling PF_MEMALLOC V2

Message ID 20191021214137.8172-1-mchristi@redhat.com (mailing list archive)
State Not Applicable
Headers show
Series Add prctl support for controlling PF_MEMALLOC V2 | expand

Commit Message

Mike Christie Oct. 21, 2019, 9:41 p.m. UTC
There are several storage drivers like dm-multipath, iscsi, tcmu-runner,
amd nbd that have userspace components that can run in the IO path. For
example, iscsi and nbd's userspace deamons may need to recreate a socket
and/or send IO on it, and dm-multipath's daemon multipathd may need to
send IO to figure out the state of paths and re-set them up.

In the kernel these drivers have access to GFP_NOIO/GFP_NOFS and the
memalloc_*_save/restore functions to control the allocation behavior,
but for userspace we would end up hitting a allocation that ended up
writing data back to the same device we are trying to allocate for.

This patch allows the userspace deamon to set the PF_MEMALLOC* flags
with prctl during their initialization so later allocations cannot
calling back into them.

Signed-off-by: Mike Christie <mchristi@redhat.com>
---

V2:
- Use prctl instead of procfs.
- Add support for NOFS for fuse.
- Check permissions.

 include/uapi/linux/prctl.h |  8 +++++++
 kernel/sys.c               | 44 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+)

Comments

Dave Chinner Oct. 21, 2019, 10:52 p.m. UTC | #1
On Mon, Oct 21, 2019 at 04:41:37PM -0500, Mike Christie wrote:
> There are several storage drivers like dm-multipath, iscsi, tcmu-runner,
> amd nbd that have userspace components that can run in the IO path. For
> example, iscsi and nbd's userspace deamons may need to recreate a socket
> and/or send IO on it, and dm-multipath's daemon multipathd may need to
> send IO to figure out the state of paths and re-set them up.
> 
> In the kernel these drivers have access to GFP_NOIO/GFP_NOFS and the
> memalloc_*_save/restore functions to control the allocation behavior,
> but for userspace we would end up hitting a allocation that ended up
> writing data back to the same device we are trying to allocate for.

I think this needs to describe the symptoms this results in. i.e.
that this can result in deadlocking the IO path.

> This patch allows the userspace deamon to set the PF_MEMALLOC* flags
> with prctl during their initialization so later allocations cannot
> calling back into them.
> 
> Signed-off-by: Mike Christie <mchristi@redhat.com>
> ---

....
> +	case PR_SET_MEMALLOC:
> +		if (!capable(CAP_SYS_ADMIN))
> +			return -EPERM;

Wouldn't CAP_SYS_RAWIO (because it's required by kernel IO path
drivers) or CAP_SYS_RESOURCE (controlling memory allocation
behaviour) be more appropriate here?

Which-ever is selected, the use should be added to the list above
the definition of the capability in include/linux/capability.h...

Otherwise looks fine to me.

Cheers,

Dave.
Michal Hocko Oct. 22, 2019, 11:24 a.m. UTC | #2
On Mon 21-10-19 16:41:37, Mike Christie wrote:
> There are several storage drivers like dm-multipath, iscsi, tcmu-runner,
> amd nbd that have userspace components that can run in the IO path. For
> example, iscsi and nbd's userspace deamons may need to recreate a socket
> and/or send IO on it, and dm-multipath's daemon multipathd may need to
> send IO to figure out the state of paths and re-set them up.
> 
> In the kernel these drivers have access to GFP_NOIO/GFP_NOFS and the
> memalloc_*_save/restore functions to control the allocation behavior,
> but for userspace we would end up hitting a allocation that ended up
> writing data back to the same device we are trying to allocate for.

Which code paths are we talking about here? Any ioctl or is this a
general syscall path? Can we mark the process in a more generic way?
E.g. we have PF_LESS_THROTTLE (used by nfsd). It doesn't affect the
reclaim recursion but it shows a pattern that doesn't really exhibit
too many internals. Maybe we need PF_IO_FLUSHER or similar?

> This patch allows the userspace deamon to set the PF_MEMALLOC* flags
> with prctl during their initialization so later allocations cannot
> calling back into them.

TBH I am not really happy to export these to the userspace. They are
an internal implementation detail and the userspace shouldn't really
care. So if this is really necessary then we need a very good argumnets
and documentation to make the usage clear.
 
> Signed-off-by: Mike Christie <mchristi@redhat.com>
> ---
> 
> V2:
> - Use prctl instead of procfs.
> - Add support for NOFS for fuse.
> - Check permissions.
> 
>  include/uapi/linux/prctl.h |  8 +++++++
>  kernel/sys.c               | 44 ++++++++++++++++++++++++++++++++++++++
>  2 files changed, 52 insertions(+)
> 
> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> index 7da1b37b27aa..6f6b3af6633a 100644
> --- a/include/uapi/linux/prctl.h
> +++ b/include/uapi/linux/prctl.h
> @@ -234,4 +234,12 @@ struct prctl_mm_map {
>  #define PR_GET_TAGGED_ADDR_CTRL		56
>  # define PR_TAGGED_ADDR_ENABLE		(1UL << 0)
>  
> +/* Control reclaim behavior when allocating memory */
> +#define PR_SET_MEMALLOC			57
> +#define PR_GET_MEMALLOC			58
> +#define PR_MEMALLOC_SET_NOIO		(1UL << 0)
> +#define PR_MEMALLOC_CLEAR_NOIO		(1UL << 1)
> +#define PR_MEMALLOC_SET_NOFS		(1UL << 2)
> +#define PR_MEMALLOC_CLEAR_NOFS		(1UL << 3)
> +
>  #endif /* _LINUX_PRCTL_H */
> diff --git a/kernel/sys.c b/kernel/sys.c
> index a611d1d58c7d..34fedc9fc7e4 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -2486,6 +2486,50 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>  			return -EINVAL;
>  		error = GET_TAGGED_ADDR_CTRL();
>  		break;
> +	case PR_SET_MEMALLOC:
> +		if (!capable(CAP_SYS_ADMIN))
> +			return -EPERM;
> +
> +		if (arg3 || arg4 || arg5)
> +			return -EINVAL;
> +
> +		switch (arg2) {
> +		case PR_MEMALLOC_SET_NOIO:
> +			if (current->flags & PF_MEMALLOC_NOFS)
> +				return -EINVAL;
> +
> +			current->flags |= PF_MEMALLOC_NOIO;
> +			break;
> +		case PR_MEMALLOC_CLEAR_NOIO:
> +			current->flags &= ~PF_MEMALLOC_NOIO;
> +			break;
> +		case PR_MEMALLOC_SET_NOFS:
> +			if (current->flags & PF_MEMALLOC_NOIO)
> +				return -EINVAL;
> +
> +			current->flags |= PF_MEMALLOC_NOFS;
> +			break;
> +		case PR_MEMALLOC_CLEAR_NOFS:
> +			current->flags &= ~PF_MEMALLOC_NOFS;
> +			break;
> +		default:
> +			return -EINVAL;
> +		}
> +		break;
> +	case PR_GET_MEMALLOC:
> +		if (!capable(CAP_SYS_ADMIN))
> +			return -EPERM;
> +
> +		if (arg2 || arg3 || arg4 || arg5)
> +			return -EINVAL;
> +
> +		if (current->flags & PF_MEMALLOC_NOIO)
> +			error = PR_MEMALLOC_SET_NOIO;
> +		else if (current->flags & PF_MEMALLOC_NOFS)
> +			error = PR_MEMALLOC_SET_NOFS;
> +		else
> +			error = 0;
> +		break;
>  	default:
>  		error = -EINVAL;
>  		break;
> -- 
> 2.20.1
>
Mike Christie Oct. 22, 2019, 3:42 p.m. UTC | #3
On 10/21/2019 05:52 PM, Dave Chinner wrote:
> On Mon, Oct 21, 2019 at 04:41:37PM -0500, Mike Christie wrote:
>> There are several storage drivers like dm-multipath, iscsi, tcmu-runner,
>> amd nbd that have userspace components that can run in the IO path. For
>> example, iscsi and nbd's userspace deamons may need to recreate a socket
>> and/or send IO on it, and dm-multipath's daemon multipathd may need to
>> send IO to figure out the state of paths and re-set them up.
>>
>> In the kernel these drivers have access to GFP_NOIO/GFP_NOFS and the
>> memalloc_*_save/restore functions to control the allocation behavior,
>> but for userspace we would end up hitting a allocation that ended up
>> writing data back to the same device we are trying to allocate for.
> 
> I think this needs to describe the symptoms this results in. i.e.
> that this can result in deadlocking the IO path.
> 
>> This patch allows the userspace deamon to set the PF_MEMALLOC* flags
>> with prctl during their initialization so later allocations cannot
>> calling back into them.
>>
>> Signed-off-by: Mike Christie <mchristi@redhat.com>
>> ---
> 
> ....
>> +	case PR_SET_MEMALLOC:
>> +		if (!capable(CAP_SYS_ADMIN))
>> +			return -EPERM;
> 
> Wouldn't CAP_SYS_RAWIO (because it's required by kernel IO path
> drivers) or CAP_SYS_RESOURCE (controlling memory allocation
> behaviour) be more appropriate here?

I think I misread a review comment last posting. I will use
CAP_SYS_RESROUCE on the next resend if people do not have any objections.

> 
> Which-ever is selected, the use should be added to the list above
> the definition of the capability in include/linux/capability.h...
> 

Will do. Thanks.

> Otherwise looks fine to me.
> 
> Cheers,
> 
> Dave.
>
Mike Christie Oct. 22, 2019, 4:13 p.m. UTC | #4
On 10/22/2019 06:24 AM, Michal Hocko wrote:
> On Mon 21-10-19 16:41:37, Mike Christie wrote:
>> There are several storage drivers like dm-multipath, iscsi, tcmu-runner,
>> amd nbd that have userspace components that can run in the IO path. For
>> example, iscsi and nbd's userspace deamons may need to recreate a socket
>> and/or send IO on it, and dm-multipath's daemon multipathd may need to
>> send IO to figure out the state of paths and re-set them up.
>>
>> In the kernel these drivers have access to GFP_NOIO/GFP_NOFS and the
>> memalloc_*_save/restore functions to control the allocation behavior,
>> but for userspace we would end up hitting a allocation that ended up
>> writing data back to the same device we are trying to allocate for.
> 
> Which code paths are we talking about here? Any ioctl or is this a
> general syscall path? Can we mark the process in a more generic way?

It depends on the daemon. The common one for example are iscsi and nbd
need network related calls like sendmsg, recvmsg, socket, etc.
tcmu-runner could need the network ones and also read and write when it
does IO to a FS or device. dm-multipath needs the sg io ioctls.


> E.g. we have PF_LESS_THROTTLE (used by nfsd). It doesn't affect the
> reclaim recursion but it shows a pattern that doesn't really exhibit
> too many internals. Maybe we need PF_IO_FLUSHER or similar?

I am not familiar with PF_IO_FLUSHER. If it prevents the recursion
problem then please send me details and I will look into it for the next
posting.

> 
>> This patch allows the userspace deamon to set the PF_MEMALLOC* flags
>> with prctl during their initialization so later allocations cannot
>> calling back into them.
> 
> TBH I am not really happy to export these to the userspace. They are
> an internal implementation detail and the userspace shouldn't really

They care in these cases, because block/fs drivers must be able to make
forward progress during writes. To meet this guarantee kernel block
drivers use mempools and memalloc/GFP flags.

For these userspace components of the block/fs drivers they already do
things normal daemons do not to meet that guarantee like mlock their
memory, disable oom killer, and preallocate resources they have control
over. They have no control over reclaim like the kernel drivers do so
its easy for us to deadlock when memory gets low.

> care. So if this is really necessary then we need a very good argumnets
> and documentation to make the usage clear.
>  
>> Signed-off-by: Mike Christie <mchristi@redhat.com>
>> ---
>>
>> V2:
>> - Use prctl instead of procfs.
>> - Add support for NOFS for fuse.
>> - Check permissions.
>>
>>  include/uapi/linux/prctl.h |  8 +++++++
>>  kernel/sys.c               | 44 ++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 52 insertions(+)
>>
>> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
>> index 7da1b37b27aa..6f6b3af6633a 100644
>> --- a/include/uapi/linux/prctl.h
>> +++ b/include/uapi/linux/prctl.h
>> @@ -234,4 +234,12 @@ struct prctl_mm_map {
>>  #define PR_GET_TAGGED_ADDR_CTRL		56
>>  # define PR_TAGGED_ADDR_ENABLE		(1UL << 0)
>>  
>> +/* Control reclaim behavior when allocating memory */
>> +#define PR_SET_MEMALLOC			57
>> +#define PR_GET_MEMALLOC			58
>> +#define PR_MEMALLOC_SET_NOIO		(1UL << 0)
>> +#define PR_MEMALLOC_CLEAR_NOIO		(1UL << 1)
>> +#define PR_MEMALLOC_SET_NOFS		(1UL << 2)
>> +#define PR_MEMALLOC_CLEAR_NOFS		(1UL << 3)
>> +
>>  #endif /* _LINUX_PRCTL_H */
>> diff --git a/kernel/sys.c b/kernel/sys.c
>> index a611d1d58c7d..34fedc9fc7e4 100644
>> --- a/kernel/sys.c
>> +++ b/kernel/sys.c
>> @@ -2486,6 +2486,50 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>>  			return -EINVAL;
>>  		error = GET_TAGGED_ADDR_CTRL();
>>  		break;
>> +	case PR_SET_MEMALLOC:
>> +		if (!capable(CAP_SYS_ADMIN))
>> +			return -EPERM;
>> +
>> +		if (arg3 || arg4 || arg5)
>> +			return -EINVAL;
>> +
>> +		switch (arg2) {
>> +		case PR_MEMALLOC_SET_NOIO:
>> +			if (current->flags & PF_MEMALLOC_NOFS)
>> +				return -EINVAL;
>> +
>> +			current->flags |= PF_MEMALLOC_NOIO;
>> +			break;
>> +		case PR_MEMALLOC_CLEAR_NOIO:
>> +			current->flags &= ~PF_MEMALLOC_NOIO;
>> +			break;
>> +		case PR_MEMALLOC_SET_NOFS:
>> +			if (current->flags & PF_MEMALLOC_NOIO)
>> +				return -EINVAL;
>> +
>> +			current->flags |= PF_MEMALLOC_NOFS;
>> +			break;
>> +		case PR_MEMALLOC_CLEAR_NOFS:
>> +			current->flags &= ~PF_MEMALLOC_NOFS;
>> +			break;
>> +		default:
>> +			return -EINVAL;
>> +		}
>> +		break;
>> +	case PR_GET_MEMALLOC:
>> +		if (!capable(CAP_SYS_ADMIN))
>> +			return -EPERM;
>> +
>> +		if (arg2 || arg3 || arg4 || arg5)
>> +			return -EINVAL;
>> +
>> +		if (current->flags & PF_MEMALLOC_NOIO)
>> +			error = PR_MEMALLOC_SET_NOIO;
>> +		else if (current->flags & PF_MEMALLOC_NOFS)
>> +			error = PR_MEMALLOC_SET_NOFS;
>> +		else
>> +			error = 0;
>> +		break;
>>  	default:
>>  		error = -EINVAL;
>>  		break;
>> -- 
>> 2.20.1
>>
>
Michal Hocko Oct. 22, 2019, 4:33 p.m. UTC | #5
On Tue 22-10-19 11:13:20, Mike Christie wrote:
> On 10/22/2019 06:24 AM, Michal Hocko wrote:
> > On Mon 21-10-19 16:41:37, Mike Christie wrote:
> >> There are several storage drivers like dm-multipath, iscsi, tcmu-runner,
> >> amd nbd that have userspace components that can run in the IO path. For
> >> example, iscsi and nbd's userspace deamons may need to recreate a socket
> >> and/or send IO on it, and dm-multipath's daemon multipathd may need to
> >> send IO to figure out the state of paths and re-set them up.
> >>
> >> In the kernel these drivers have access to GFP_NOIO/GFP_NOFS and the
> >> memalloc_*_save/restore functions to control the allocation behavior,
> >> but for userspace we would end up hitting a allocation that ended up
> >> writing data back to the same device we are trying to allocate for.
> > 
> > Which code paths are we talking about here? Any ioctl or is this a
> > general syscall path? Can we mark the process in a more generic way?
> 
> It depends on the daemon. The common one for example are iscsi and nbd
> need network related calls like sendmsg, recvmsg, socket, etc.
> tcmu-runner could need the network ones and also read and write when it
> does IO to a FS or device. dm-multipath needs the sg io ioctls.

OK, so there is not a clear kernel entry point that could be explicitly
annotated. This would imply a per task context. This is an important
information. And I am wondering how those usecases ever worked in the
first place. This is not a minor detail.
 
> > E.g. we have PF_LESS_THROTTLE (used by nfsd). It doesn't affect the
> > reclaim recursion but it shows a pattern that doesn't really exhibit
> > too many internals. Maybe we need PF_IO_FLUSHER or similar?
> 
> I am not familiar with PF_IO_FLUSHER. If it prevents the recursion
> problem then please send me details and I will look into it for the next
> posting.

PF_IO_FLUSHER doesn't exist. I just wanted to point out that similarly
to PF_LESS_THROTTLE it should be a more high level per task flag rather
than something as low level as a direct control of gfp allocation
context. PF_LESS_THROTTLE simply tells that the task is a part of the
reclaim process and therefore it shouldn't be a subject of a normal
throttling - whatever that means. PF_IO_FLUSHER would mean that the user
context is a part of the IO path and therefore there are certain reclaim
recursion restrictions.
 
> >> This patch allows the userspace deamon to set the PF_MEMALLOC* flags
> >> with prctl during their initialization so later allocations cannot
> >> calling back into them.
> > 
> > TBH I am not really happy to export these to the userspace. They are
> > an internal implementation detail and the userspace shouldn't really
> 
> They care in these cases, because block/fs drivers must be able to make
> forward progress during writes. To meet this guarantee kernel block
> drivers use mempools and memalloc/GFP flags.
> 
> For these userspace components of the block/fs drivers they already do
> things normal daemons do not to meet that guarantee like mlock their
> memory, disable oom killer, and preallocate resources they have control
> over. They have no control over reclaim like the kernel drivers do so
> its easy for us to deadlock when memory gets low.

OK, fair enough. How much of a control do they really need though. Is a
single PF_IO_FLUSHER as explained above (essentially imply GPF_NOIO
context) sufficient?
Dave Chinner Oct. 22, 2019, 8:43 p.m. UTC | #6
On Tue, Oct 22, 2019 at 06:33:10PM +0200, Michal Hocko wrote:
> On Tue 22-10-19 11:13:20, Mike Christie wrote:
> > On 10/22/2019 06:24 AM, Michal Hocko wrote:
> > > On Mon 21-10-19 16:41:37, Mike Christie wrote:
> > >> There are several storage drivers like dm-multipath, iscsi, tcmu-runner,
> > >> amd nbd that have userspace components that can run in the IO path. For
> > >> example, iscsi and nbd's userspace deamons may need to recreate a socket
> > >> and/or send IO on it, and dm-multipath's daemon multipathd may need to
> > >> send IO to figure out the state of paths and re-set them up.
> > >>
> > >> In the kernel these drivers have access to GFP_NOIO/GFP_NOFS and the
> > >> memalloc_*_save/restore functions to control the allocation behavior,
> > >> but for userspace we would end up hitting a allocation that ended up
> > >> writing data back to the same device we are trying to allocate for.
> > > 
> > > Which code paths are we talking about here? Any ioctl or is this a
> > > general syscall path? Can we mark the process in a more generic way?
> > 
> > It depends on the daemon. The common one for example are iscsi and nbd
> > need network related calls like sendmsg, recvmsg, socket, etc.
> > tcmu-runner could need the network ones and also read and write when it
> > does IO to a FS or device. dm-multipath needs the sg io ioctls.
> 
> OK, so there is not a clear kernel entry point that could be explicitly
> annotated. This would imply a per task context. This is an important
> information. And I am wondering how those usecases ever worked in the
> first place. This is not a minor detail.

They don't work, and we've known it for many years. It's just that
most of the time they are not run with really low memory. :)

e.g. loopback devices have long been known to deadlock like this
buts it's only very recently we gave it PF_MEMALLOC_NOIO protection
for it's kernel internal read() and write() calls.

> > > E.g. we have PF_LESS_THROTTLE (used by nfsd). It doesn't affect the
> > > reclaim recursion but it shows a pattern that doesn't really exhibit
> > > too many internals. Maybe we need PF_IO_FLUSHER or similar?
> > 
> > I am not familiar with PF_IO_FLUSHER. If it prevents the recursion
> > problem then please send me details and I will look into it for the next
> > posting.
> 
> PF_IO_FLUSHER doesn't exist. I just wanted to point out that similarly
> to PF_LESS_THROTTLE it should be a more high level per task flag rather
> than something as low level as a direct control of gfp allocation
> context. PF_LESS_THROTTLE simply tells that the task is a part of the
> reclaim process and therefore it shouldn't be a subject of a normal
> throttling - whatever that means.

PF_LESS_THROTTLE doesn't do that at all - it really only changes
limits slightly and doesn't prevent reclaim recursion deadlocks in
any way.

What PF_LESS_THROTTLE was largely intended for is give the process a
small amount of extra overhead on dirty page throttle limits so that
if it's a stacked device it won't get throttled before the upper
filesystem gets throttled.

i.e. the idea is that it's got a -little- bit  more wiggle room
before things like balance_dirty_pages() stops incoming writes,
hence allowing writes to the upper filesystems to be throttled first
before the underlying device that may be cleaning pages.

NFS uses this in the case of a loopback mount - both the client and
the server are on the same node, and so the data is double-cached.
FOr the client to clean it's page, the server has to be able to
write() the dirty data through balance_dirty_pages, and if we are at
the dirty page limit then it will block and we effectively deadlock
the writeback because we can't write the server side page that will
clean the client side page. So the NFS server is given a higher
dirty throttle limit by PF_LESS_THROTTLE so it can write when the
client is throttled.

The same problem exists for the loopback block device, and it also
sets PF_LESS_THROTTLE. But because it's a stacked block device
(unlike the NFS setup) it has actual filesystem memory reclaim
recursion problems (i.e. lower fs context allocation cleaning upper
fs pages recursing into upper fs to reclaim pages) and so it also
sets PF_MEMALLOC_NOIO to prevent these reclaim deadlocks.

IOWs, the situation with these userspace processes is akin to the
loopback device, not the "NFS client/NFS server on same host"
situation. We have lots of evidence of reclaim recursion hangs, but
very little evidence of balance_dirty_pages() throttling hangs....

> PF_IO_FLUSHER would mean that the user
> context is a part of the IO path and therefore there are certain reclaim
> recursion restrictions.

If PF_IO_FLUSHER just maps to PF_LESS_THROTTLE|PF_MEMALLOC_NOIO,
then I'm not sure we need a new definition. Maybe that's the ptrace
flag name, but in the kernel we don't need a PF_IO_FLUSHER process
flag...

> > >> This patch allows the userspace deamon to set the PF_MEMALLOC* flags
> > >> with prctl during their initialization so later allocations cannot
> > >> calling back into them.
> > > 
> > > TBH I am not really happy to export these to the userspace. They are
> > > an internal implementation detail and the userspace shouldn't really
> > 
> > They care in these cases, because block/fs drivers must be able to make
> > forward progress during writes. To meet this guarantee kernel block
> > drivers use mempools and memalloc/GFP flags.
> > 
> > For these userspace components of the block/fs drivers they already do
> > things normal daemons do not to meet that guarantee like mlock their
> > memory, disable oom killer, and preallocate resources they have control
> > over. They have no control over reclaim like the kernel drivers do so
> > its easy for us to deadlock when memory gets low.
> 
> OK, fair enough. How much of a control do they really need though. Is a
> single PF_IO_FLUSHER as explained above (essentially imply GPF_NOIO
> context) sufficient?

I think some of these usrspace processes work at the filesystem
level and so really only need GFP_NOFS allocation (fuse), while
others work at the block device level (iscsi, nbd) so need GFP_NOIO
allocation. So there's definitely an argument for providing both...

Cheers,

Dave.
Michal Hocko Oct. 23, 2019, 7:11 a.m. UTC | #7
On Wed 23-10-19 07:43:44, Dave Chinner wrote:
> On Tue, Oct 22, 2019 at 06:33:10PM +0200, Michal Hocko wrote:

Thanks for more clarifiation regarding PF_LESS_THROTTLE.

[...]
> > PF_IO_FLUSHER would mean that the user
> > context is a part of the IO path and therefore there are certain reclaim
> > recursion restrictions.
> 
> If PF_IO_FLUSHER just maps to PF_LESS_THROTTLE|PF_MEMALLOC_NOIO,
> then I'm not sure we need a new definition. Maybe that's the ptrace
> flag name, but in the kernel we don't need a PF_IO_FLUSHER process
> flag...

Yes, the internal implementation would do something like that. I was
more interested in the user space visible API at this stage. Something
generic enough because exporting MEMALLOC flags is just a bad idea IMHO
(especially PF_MEMALLOC).

> > > >> This patch allows the userspace deamon to set the PF_MEMALLOC* flags
> > > >> with prctl during their initialization so later allocations cannot
> > > >> calling back into them.
> > > > 
> > > > TBH I am not really happy to export these to the userspace. They are
> > > > an internal implementation detail and the userspace shouldn't really
> > > 
> > > They care in these cases, because block/fs drivers must be able to make
> > > forward progress during writes. To meet this guarantee kernel block
> > > drivers use mempools and memalloc/GFP flags.
> > > 
> > > For these userspace components of the block/fs drivers they already do
> > > things normal daemons do not to meet that guarantee like mlock their
> > > memory, disable oom killer, and preallocate resources they have control
> > > over. They have no control over reclaim like the kernel drivers do so
> > > its easy for us to deadlock when memory gets low.
> > 
> > OK, fair enough. How much of a control do they really need though. Is a
> > single PF_IO_FLUSHER as explained above (essentially imply GPF_NOIO
> > context) sufficient?
> 
> I think some of these usrspace processes work at the filesystem
> level and so really only need GFP_NOFS allocation (fuse), while
> others work at the block device level (iscsi, nbd) so need GFP_NOIO
> allocation. So there's definitely an argument for providing both...

The main question is whether giving more APIs is really necessary. Is
there any real problem to give them only PF_IO_FLUSHER and let both
groups use this one? It will imply more reclaim restrictions for solely
FS based ones but is this a practical problem? If yes we can always add
PF_FS_$FOO later on.
Mike Christie Oct. 23, 2019, 5:27 p.m. UTC | #8
On 10/23/2019 02:11 AM, Michal Hocko wrote:
> On Wed 23-10-19 07:43:44, Dave Chinner wrote:
>> On Tue, Oct 22, 2019 at 06:33:10PM +0200, Michal Hocko wrote:
> 
> Thanks for more clarifiation regarding PF_LESS_THROTTLE.
> 
> [...]
>>> PF_IO_FLUSHER would mean that the user
>>> context is a part of the IO path and therefore there are certain reclaim
>>> recursion restrictions.
>>
>> If PF_IO_FLUSHER just maps to PF_LESS_THROTTLE|PF_MEMALLOC_NOIO,
>> then I'm not sure we need a new definition. Maybe that's the ptrace
>> flag name, but in the kernel we don't need a PF_IO_FLUSHER process
>> flag...
> 
> Yes, the internal implementation would do something like that. I was
> more interested in the user space visible API at this stage. Something
> generic enough because exporting MEMALLOC flags is just a bad idea IMHO
> (especially PF_MEMALLOC).

Do you mean we would do something like:

prctl()
....
case PF_SET_IO_FLUSHER:
        current->flags |= PF_MEMALLOC_NOIO;
....

or are you saying we would add a new PF_IO_FLUSHER flag and then modify
PF_MEMALLOC_NOIO uses like in current_gfp_context:

if (current->flags & (PF_MEMALLOC_NOIO | PF_IO_FLUSHER)
      flags &= ~(__GFP_IO | __GFP_FS);

?

> 
>>>>>> This patch allows the userspace deamon to set the PF_MEMALLOC* flags
>>>>>> with prctl during their initialization so later allocations cannot
>>>>>> calling back into them.
>>>>>
>>>>> TBH I am not really happy to export these to the userspace. They are
>>>>> an internal implementation detail and the userspace shouldn't really
>>>>
>>>> They care in these cases, because block/fs drivers must be able to make
>>>> forward progress during writes. To meet this guarantee kernel block
>>>> drivers use mempools and memalloc/GFP flags.
>>>>
>>>> For these userspace components of the block/fs drivers they already do
>>>> things normal daemons do not to meet that guarantee like mlock their
>>>> memory, disable oom killer, and preallocate resources they have control
>>>> over. They have no control over reclaim like the kernel drivers do so
>>>> its easy for us to deadlock when memory gets low.
>>>
>>> OK, fair enough. How much of a control do they really need though. Is a
>>> single PF_IO_FLUSHER as explained above (essentially imply GPF_NOIO
>>> context) sufficient?
>>
>> I think some of these usrspace processes work at the filesystem
>> level and so really only need GFP_NOFS allocation (fuse), while
>> others work at the block device level (iscsi, nbd) so need GFP_NOIO
>> allocation. So there's definitely an argument for providing both...
> 
> The main question is whether giving more APIs is really necessary. Is
> there any real problem to give them only PF_IO_FLUSHER and let both
> groups use this one? It will imply more reclaim restrictions for solely
> FS based ones but is this a practical problem? If yes we can always add
> PF_FS_$FOO later on.


I am not sure. I will have to defer to general FS experts like Dave or
Martin and Damien for the specific fuse case. There do not seem to be a
lot of places where we check for __GFP_IO so configs with fuse and
bcache for example are probably not a big deal. However, I am not very
familiar with some of the other code paths in the mm layer and how FSs
interact with them.
Michal Hocko Oct. 23, 2019, 5:35 p.m. UTC | #9
On Wed 23-10-19 12:27:29, Mike Christie wrote:
> On 10/23/2019 02:11 AM, Michal Hocko wrote:
> > On Wed 23-10-19 07:43:44, Dave Chinner wrote:
> >> On Tue, Oct 22, 2019 at 06:33:10PM +0200, Michal Hocko wrote:
> > 
> > Thanks for more clarifiation regarding PF_LESS_THROTTLE.
> > 
> > [...]
> >>> PF_IO_FLUSHER would mean that the user
> >>> context is a part of the IO path and therefore there are certain reclaim
> >>> recursion restrictions.
> >>
> >> If PF_IO_FLUSHER just maps to PF_LESS_THROTTLE|PF_MEMALLOC_NOIO,
> >> then I'm not sure we need a new definition. Maybe that's the ptrace
> >> flag name, but in the kernel we don't need a PF_IO_FLUSHER process
> >> flag...
> > 
> > Yes, the internal implementation would do something like that. I was
> > more interested in the user space visible API at this stage. Something
> > generic enough because exporting MEMALLOC flags is just a bad idea IMHO
> > (especially PF_MEMALLOC).
> 
> Do you mean we would do something like:
> 
> prctl()
> ....
> case PF_SET_IO_FLUSHER:
>         current->flags |= PF_MEMALLOC_NOIO;
> ....

yes, along with PF_LESS_THROTTLE.
diff mbox series

Patch

diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 7da1b37b27aa..6f6b3af6633a 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -234,4 +234,12 @@  struct prctl_mm_map {
 #define PR_GET_TAGGED_ADDR_CTRL		56
 # define PR_TAGGED_ADDR_ENABLE		(1UL << 0)
 
+/* Control reclaim behavior when allocating memory */
+#define PR_SET_MEMALLOC			57
+#define PR_GET_MEMALLOC			58
+#define PR_MEMALLOC_SET_NOIO		(1UL << 0)
+#define PR_MEMALLOC_CLEAR_NOIO		(1UL << 1)
+#define PR_MEMALLOC_SET_NOFS		(1UL << 2)
+#define PR_MEMALLOC_CLEAR_NOFS		(1UL << 3)
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index a611d1d58c7d..34fedc9fc7e4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2486,6 +2486,50 @@  SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			return -EINVAL;
 		error = GET_TAGGED_ADDR_CTRL();
 		break;
+	case PR_SET_MEMALLOC:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+
+		switch (arg2) {
+		case PR_MEMALLOC_SET_NOIO:
+			if (current->flags & PF_MEMALLOC_NOFS)
+				return -EINVAL;
+
+			current->flags |= PF_MEMALLOC_NOIO;
+			break;
+		case PR_MEMALLOC_CLEAR_NOIO:
+			current->flags &= ~PF_MEMALLOC_NOIO;
+			break;
+		case PR_MEMALLOC_SET_NOFS:
+			if (current->flags & PF_MEMALLOC_NOIO)
+				return -EINVAL;
+
+			current->flags |= PF_MEMALLOC_NOFS;
+			break;
+		case PR_MEMALLOC_CLEAR_NOFS:
+			current->flags &= ~PF_MEMALLOC_NOFS;
+			break;
+		default:
+			return -EINVAL;
+		}
+		break;
+	case PR_GET_MEMALLOC:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (arg2 || arg3 || arg4 || arg5)
+			return -EINVAL;
+
+		if (current->flags & PF_MEMALLOC_NOIO)
+			error = PR_MEMALLOC_SET_NOIO;
+		else if (current->flags & PF_MEMALLOC_NOFS)
+			error = PR_MEMALLOC_SET_NOFS;
+		else
+			error = 0;
+		break;
 	default:
 		error = -EINVAL;
 		break;