
[13/18] io_uring: add file set registration

Message ID 20190207195552.22770-14-axboe@kernel.dk
State New, archived
Series: [01/18] fs: add an iopoll method to struct file_operations

Commit Message

Jens Axboe Feb. 7, 2019, 7:55 p.m. UTC
We normally have to fget/fput for each IO we do on a file. Even with
the batching we do, the cost of the atomic inc/dec of the file usage
count adds up.

This adds IORING_REGISTER_FILES and IORING_UNREGISTER_FILES opcodes
for the io_uring_register(2) system call. The arguments passed in must
be an array of __s32 holding file descriptors, and nr_args should hold
the number of file descriptors the application wishes to pin for the
duration of the io_uring context (or until IORING_UNREGISTER_FILES is
called).

When used, the application must set IOSQE_FIXED_FILE in the sqe->flags
member. Then, instead of setting sqe->fd to the real fd, it sets sqe->fd
to the index in the array passed in to IORING_REGISTER_FILES.

Files are automatically unregistered when the io_uring context is
torn down. An application need only unregister if it wishes to
register a new set of fds.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 207 +++++++++++++++++++++++++++++-----
 include/net/af_unix.h         |   1 +
 include/uapi/linux/io_uring.h |   9 +-
 net/unix/af_unix.c            |   2 +-
 4 files changed, 188 insertions(+), 31 deletions(-)
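
To make the interface concrete, here is a minimal userspace sketch of the
flow the commit message describes (an editorial illustration, not part of
the patch). It assumes the uapi header from this series is installed as
<linux/io_uring.h> and that <sys/syscall.h> provides __NR_io_uring_register;
error handling is elided.

#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>	/* struct io_uring_sqe, IOSQE_FIXED_FILE */

/* io_uring_register(2) has no libc wrapper; invoke it directly. */
static int register_files(int ring_fd, const int32_t *fds, unsigned nr_fds)
{
	return syscall(__NR_io_uring_register, ring_fd,
		       IORING_REGISTER_FILES, fds, nr_fds);
}

/* Point an sqe at registered file 'index' instead of a real fd. */
static void sqe_use_fixed_file(struct io_uring_sqe *sqe, unsigned index)
{
	sqe->flags |= IOSQE_FIXED_FILE;
	sqe->fd = index;	/* index into the IORING_REGISTER_FILES array */
}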

Comments

Alan Jenkins Feb. 8, 2019, 12:17 p.m. UTC | #1
On 07/02/2019 19:55, Jens Axboe wrote:
> We normally have to fget/fput for each IO we do on a file. Even with
> the batching we do, the cost of the atomic inc/dec of the file usage
> count adds up.
>
> This adds IORING_REGISTER_FILES and IORING_UNREGISTER_FILES opcodes
> for the io_uring_register(2) system call. The arguments passed in must
> be an array of __s32 holding file descriptors, and nr_args should hold
> the number of file descriptors the application wishes to pin for the
> duration of the io_uring context (or until IORING_UNREGISTER_FILES is
> called).
>
> When used, the application must set IOSQE_FIXED_FILE in the sqe->flags
> member. Then, instead of setting sqe->fd to the real fd, it sets sqe->fd
> to the index in the array passed in to IORING_REGISTER_FILES.
>
> Files are automatically unregistered when the io_uring context is
> torn down. An application need only unregister if it wishes to
> register a new set of fds.
>
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---
>   fs/io_uring.c                 | 207 +++++++++++++++++++++++++++++-----
>   include/net/af_unix.h         |   1 +
>   include/uapi/linux/io_uring.h |   9 +-
>   net/unix/af_unix.c            |   2 +-
>   4 files changed, 188 insertions(+), 31 deletions(-)
>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index 9d6233dc35ca..f2550efec60d 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -29,6 +29,7 @@
>   #include <linux/net.h>
>   #include <net/sock.h>
>   #include <net/af_unix.h>
> +#include <net/scm.h>
>   #include <linux/anon_inodes.h>
>   #include <linux/sched/mm.h>
>   #include <linux/uaccess.h>
> @@ -101,6 +102,13 @@ struct io_ring_ctx {
>   		struct fasync_struct	*cq_fasync;
>   	} ____cacheline_aligned_in_smp;
>   
> +	/*
> +	 * If used, fixed file set. Writers must ensure that ->refs is dead,
> +	 * readers must ensure that ->refs is alive as long as the file* is
> +	 * used. Only updated through io_uring_register(2).
> +	 */
> +	struct scm_fp_list	*user_files;
> +
>   	/* if used, fixed mapped user buffers */
>   	unsigned		nr_user_bufs;
>   	struct io_mapped_ubuf	*user_bufs;
> @@ -148,6 +156,7 @@ struct io_kiocb {
>   	unsigned int		flags;
>   #define REQ_F_FORCE_NONBLOCK	1	/* inline submission attempt */
>   #define REQ_F_IOPOLL_COMPLETED	2	/* polled IO has completed */
> +#define REQ_F_FIXED_FILE	4	/* ctx owns file */
>   	u64			user_data;
>   	u64			error;
>   

> +static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
> +{
> +#if defined(CONFIG_NET)
> +	if (ctx->ring_sock) {
> +		struct sock *sock = ctx->ring_sock->sk;
> +		struct sk_buff *skb;
> +
> +		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
> +			kfree_skb(skb);
> +	}
> +#else
> +	int i;
> +
> +	for (i = 0; i < ctx->user_files->count; i++)
> +		fput(ctx->user_files->fp[i]);
> +
> +	kfree(ctx->user_files);
> +#endif
> +}
> +
> +static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
> +{
> +	if (!ctx->user_files)
> +		return -ENXIO;
> +
> +	__io_sqe_files_unregister(ctx);
> +	ctx->user_files = NULL;
> +	return 0;
> +}
> +
> +static int io_sqe_files_scm(struct io_ring_ctx *ctx)
> +{
> +#if defined(CONFIG_NET)
> +	struct scm_fp_list *fpl = ctx->user_files;
> +	struct sk_buff *skb;
> +	int i;
> +
> +	skb =  __alloc_skb(0, GFP_KERNEL, 0, NUMA_NO_NODE);
> +	if (!skb)
> +		return -ENOMEM;
> +
> +	skb->sk = ctx->ring_sock->sk;
> +	skb->destructor = unix_destruct_scm;
> +
> +	fpl->user = get_uid(ctx->user);
> +	for (i = 0; i < fpl->count; i++) {
> +		get_file(fpl->fp[i]);
> +		unix_inflight(fpl->user, fpl->fp[i]);
> +		fput(fpl->fp[i]);
> +	}
> +
> +	UNIXCB(skb).fp = fpl;
> +	skb_queue_head(&ctx->ring_sock->sk->sk_receive_queue, skb);

This code sounds elegant if you know about the existence of unix_gc(), 
but quite mysterious if you don't.  (E.g. why "inflight"?)  Could we 
have a brief comment, to comfort mortal readers on their journey?

/* A message on a unix socket can hold a reference to a file. This can 
cause a reference cycle. So there is a garbage collector for unix 
sockets, which we hook into here. */

I think this is bypassing too_many_unix_fds() though?  I understood that 
was intended to bound kernel memory allocation, at least in principle.

> +#endif

Also, this code relies on CONFIG_NET.  To handle the case where 
CONFIG_NET is not enabled, don't you still need to forbid registering an 
io_uring fd?
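
(For concreteness, one shape such a check could take at registration time,
as a sketch: io_uring_fops is the anon-inode file_operations backing the
ring fd in this patch, but its use here, and the placement inside the
fget() loop, are assumptions.)

		/*
		 * Refuse to register the io_uring fd itself: a ring that
		 * holds a reference to itself could never be torn down,
		 * and without CONFIG_NET there is no unix GC to break
		 * the cycle.
		 */
		if (fpl->fp[i]->f_op == &io_uring_fops) {
			fput(fpl->fp[i]);
			fpl->fp[i] = NULL;
			ret = -EBADF;
			break;
		}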

> +	return 0;
> +}
> +
> +static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
> +				 unsigned nr_args)
> +{
> +	__s32 __user *fds = (__s32 __user *) arg;
> +	struct scm_fp_list *fpl;
> +	int fd, ret = 0;
> +	unsigned i;
> +
> +	if (ctx->user_files)
> +		return -EBUSY;
> +	if (!nr_args || nr_args > SCM_MAX_FD)
> +		return -EINVAL;
> +
> +	fpl = kzalloc(sizeof(*ctx->user_files), GFP_KERNEL);
> +	if (!fpl)
> +		return -ENOMEM;
> +	fpl->max = nr_args;
> +
> +	for (i = 0; i < nr_args; i++) {
> +		ret = -EFAULT;
> +		if (copy_from_user(&fd, &fds[i], sizeof(fd)))
> +			break;
> +
> +		fpl->fp[i] = fget(fd);
> +
> +		ret = -EBADF;
> +		if (!fpl->fp[i])
> +			break;
> +		fpl->count++;
> +		ret = 0;
> +	}
> +
> +	ctx->user_files = fpl;
> +	if (!ret)
> +		ret = io_sqe_files_scm(ctx);
> +	if (ret)
> +		io_sqe_files_unregister(ctx);
> +
> +	return ret;
> +}
> +
>   static int io_sq_offload_start(struct io_ring_ctx *ctx)
>   {
>   	int ret;
> @@ -1520,14 +1658,16 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
>   		destroy_workqueue(ctx->sqo_wq);
>   	if (ctx->sqo_mm)
>   		mmdrop(ctx->sqo_mm);
> +
> +	io_iopoll_reap_events(ctx);
> +	io_sqe_buffer_unregister(ctx);
> +	io_sqe_files_unregister(ctx);
> +
>   #if defined(CONFIG_NET)
>   	if (ctx->ring_sock)
>   		sock_release(ctx->ring_sock);
>   #endif
>   
> -	io_iopoll_reap_events(ctx);
> -	io_sqe_buffer_unregister(ctx);
> -
>   	io_mem_free(ctx->sq_ring);
>   	io_mem_free(ctx->sq_sqes);
>   	io_mem_free(ctx->cq_ring);
> @@ -1885,6 +2025,15 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
>   			break;
>   		ret = io_sqe_buffer_unregister(ctx);
>   		break;
> +	case IORING_REGISTER_FILES:
> +		ret = io_sqe_files_register(ctx, arg, nr_args);
> +		break;
> +	case IORING_UNREGISTER_FILES:
> +		ret = -EINVAL;
> +		if (arg || nr_args)
> +			break;
> +		ret = io_sqe_files_unregister(ctx);
> +		break;
>   	default:
>   		ret = -EINVAL;
>   		break;
Jens Axboe Feb. 8, 2019, 12:57 p.m. UTC | #2
On 2/8/19 5:17 AM, Alan Jenkins wrote:
>> +static int io_sqe_files_scm(struct io_ring_ctx *ctx)
>> +{
>> +#if defined(CONFIG_NET)
>> +	struct scm_fp_list *fpl = ctx->user_files;
>> +	struct sk_buff *skb;
>> +	int i;
>> +
>> +	skb =  __alloc_skb(0, GFP_KERNEL, 0, NUMA_NO_NODE);
>> +	if (!skb)
>> +		return -ENOMEM;
>> +
>> +	skb->sk = ctx->ring_sock->sk;
>> +	skb->destructor = unix_destruct_scm;
>> +
>> +	fpl->user = get_uid(ctx->user);
>> +	for (i = 0; i < fpl->count; i++) {
>> +		get_file(fpl->fp[i]);
>> +		unix_inflight(fpl->user, fpl->fp[i]);
>> +		fput(fpl->fp[i]);
>> +	}
>> +
>> +	UNIXCB(skb).fp = fpl;
>> +	skb_queue_head(&ctx->ring_sock->sk->sk_receive_queue, skb);
> 
> This code sounds elegant if you know about the existence of unix_gc(), 
> but quite mysterious if you don't.  (E.g. why "inflight"?)  Could we 
> have a brief comment, to comfort mortal readers on their journey?
> 
> /* A message on a unix socket can hold a reference to a file. This can 
> cause a reference cycle. So there is a garbage collector for unix 
> sockets, which we hook into here. */

Yes that's a good idea, I've added a comment as to why we go through the
trouble of doing this socket + skb dance.

> I think this is bypassing too_many_unix_fds() though?  I understood that 
> was intended to bound kernel memory allocation, at least in principle.

As the code stands above, it'll cap it at 253. I'm just now reworking it
to NOT be limited to the SCM max fd count, but still impose a limit of
1024 on the number of registered files. This is important to cap the
memory allocation attempt as well.
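
(A sketch of what that rework might look like: the registered set is split
into SCM_MAX_FD-sized chunks, one skb per chunk. The chunked
__io_sqe_files_scm(ctx, nr, offset) signature matches the follow-up patch
quoted later in this thread; the loop itself is assumed.)

static int io_sqe_files_scm_all(struct io_ring_ctx *ctx, unsigned nr_files)
{
	unsigned left = nr_files, offset = 0;

	while (left) {
		unsigned this_nr = min_t(unsigned, left, SCM_MAX_FD);
		int ret;

		/* each call queues one skb carrying up to SCM_MAX_FD files */
		ret = __io_sqe_files_scm(ctx, this_nr, offset);
		if (ret)
			return ret;
		left -= this_nr;
		offset += this_nr;
	}
	return 0;
}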

> Also, this code relies on CONFIG_NET.  To handle the case where 
> CONFIG_NET is not enabled, don't you still need to forbid registering an 
> io_uring fd ?

Good point, we do still need to reject the io_uring fd itself if
CONFIG_UNIX is not enabled. Done.
Alan Jenkins Feb. 8, 2019, 2:02 p.m. UTC | #3
On 08/02/2019 12:57, Jens Axboe wrote:
> On 2/8/19 5:17 AM, Alan Jenkins wrote:
>>> +static int io_sqe_files_scm(struct io_ring_ctx *ctx)
>>> +{
>>> +#if defined(CONFIG_NET)
>>> +	struct scm_fp_list *fpl = ctx->user_files;
>>> +	struct sk_buff *skb;
>>> +	int i;
>>> +
>>> +	skb =  __alloc_skb(0, GFP_KERNEL, 0, NUMA_NO_NODE);
>>> +	if (!skb)
>>> +		return -ENOMEM;
>>> +
>>> +	skb->sk = ctx->ring_sock->sk;
>>> +	skb->destructor = unix_destruct_scm;
>>> +
>>> +	fpl->user = get_uid(ctx->user);
>>> +	for (i = 0; i < fpl->count; i++) {
>>> +		get_file(fpl->fp[i]);
>>> +		unix_inflight(fpl->user, fpl->fp[i]);
>>> +		fput(fpl->fp[i]);
>>> +	}
>>> +
>>> +	UNIXCB(skb).fp = fpl;
>>> +	skb_queue_head(&ctx->ring_sock->sk->sk_receive_queue, skb);
>> This code sounds elegant if you know about the existence of unix_gc(),
>> but quite mysterious if you don't.  (E.g. why "inflight"?)  Could we
>> have a brief comment, to comfort mortal readers on their journey?
>>
>> /* A message on a unix socket can hold a reference to a file. This can
>> cause a reference cycle. So there is a garbage collector for unix
>> sockets, which we hook into here. */
> Yes that's a good idea, I've added a comment as to why we go through the
> trouble of doing this socket + skb dance.

Great, thanks.

>> I think this is bypassing too_many_unix_fds() though?  I understood that
>> was intended to bound kernel memory allocation, at least in principle.
> As the code stands above, it'll cap it at 253. I'm just now reworking it
> to NOT be limited to the SCM max fd count, but still impose a limit of
> 1024 on the number of registered files. This is important to cap the
> memory allocation attempt as well.

I saw you were limiting to SCM_MAX_FD per io_uring.  On the other hand, 
there's no specific limit on the number of io_urings you can open (only 
the standard limits on fds).  So this would let you allocate hundreds of 
times more files than the previous limit RLIMIT_NOFILE...

static inline bool too_many_unix_fds(struct task_struct *p)
{
	struct user_struct *user = current_user();

	if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
	return false;
}

RLIMIT_NOFILE is technically per-task, but here it is capping 
unix_inflight per-user.  So the way I look at this, the number of file 
descriptors per user is bounded by NOFILE * NPROC.  Then 
user->unix_inflight can have one additional process' worth (NOFILE) of 
"inflight" files.  (Plus SCM_MAX_FD slop, because too_many_fds() is only 
called once per SCM_RIGHTS).

Because io_uring doesn't check too_many_unix_fds(), I think it will let 
you have about 253 (or 1024) more process' worth of open files. That 
could be big proportionally when RLIMIT_NPROC is low.

I don't know if it matters.  It maybe reads like an oversight though.

(If it does matter, it might be cleanest to change too_many_unix_fds() 
to get rid of the "slop".  Since that may be different between af_unix 
and io_uring; 253 v.s. 1024 or whatever. E.g. add a parameter for the 
number of inflight files we want to add.)
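
(Sketched, that suggested signature change might look like the following;
the batch-size parameter and charging the whole batch up front are the
assumptions here:)

static inline bool too_many_unix_fds(struct task_struct *p, unsigned nr_new)
{
	struct user_struct *user = current_user();

	/* charge the whole batch instead of allowing SCM_MAX_FD slop */
	if (unlikely(user->unix_inflight + nr_new > task_rlimit(p, RLIMIT_NOFILE)))
		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
	return false;
}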

>> Also, this code relies on CONFIG_NET.  To handle the case where
>> CONFIG_NET is not enabled, don't you still need to forbid registering an
>> io_uring fd ?
> Good point, we do still need to reject the io_uring fd itself if
> CONFIG_UNIX is not enabled. Done.
Jens Axboe Feb. 8, 2019, 3:13 p.m. UTC | #4
On 2/8/19 7:02 AM, Alan Jenkins wrote:
> On 08/02/2019 12:57, Jens Axboe wrote:
>> On 2/8/19 5:17 AM, Alan Jenkins wrote:
>>>> +static int io_sqe_files_scm(struct io_ring_ctx *ctx)
>>>> +{
>>>> +#if defined(CONFIG_NET)
>>>> +	struct scm_fp_list *fpl = ctx->user_files;
>>>> +	struct sk_buff *skb;
>>>> +	int i;
>>>> +
>>>> +	skb =  __alloc_skb(0, GFP_KERNEL, 0, NUMA_NO_NODE);
>>>> +	if (!skb)
>>>> +		return -ENOMEM;
>>>> +
>>>> +	skb->sk = ctx->ring_sock->sk;
>>>> +	skb->destructor = unix_destruct_scm;
>>>> +
>>>> +	fpl->user = get_uid(ctx->user);
>>>> +	for (i = 0; i < fpl->count; i++) {
>>>> +		get_file(fpl->fp[i]);
>>>> +		unix_inflight(fpl->user, fpl->fp[i]);
>>>> +		fput(fpl->fp[i]);
>>>> +	}
>>>> +
>>>> +	UNIXCB(skb).fp = fpl;
>>>> +	skb_queue_head(&ctx->ring_sock->sk->sk_receive_queue, skb);
>>> This code sounds elegant if you know about the existence of unix_gc(),
>>> but quite mysterious if you don't.  (E.g. why "inflight"?)  Could we
>>> have a brief comment, to comfort mortal readers on their journey?
>>>
>>> /* A message on a unix socket can hold a reference to a file. This can
>>> cause a reference cycle. So there is a garbage collector for unix
>>> sockets, which we hook into here. */
>> Yes that's a good idea, I've added a comment as to why we go through the
>> trouble of doing this socket + skb dance.
> 
> Great, thanks.
> 
>>> I think this is bypassing too_many_unix_fds() though?  I understood that
>>> was intended to bound kernel memory allocation, at least in principle.
>> As the code stands above, it'll cap it at 253. I'm just now reworking it
>> to NOT be limited to the SCM max fd count, but still impose a limit of
>> 1024 on the number of registered files. This is important to cap the
>> memory allocation attempt as well.
> 
> I saw you were limiting to SCM_MAX_FD per io_uring.  On the other hand, 
> there's no specific limit on the number of io_urings you can open (only 
> the standard limits on fds).  So this would let you allocate hundreds of 
> times more files than the previous limit RLIMIT_NOFILE...

But there is, the io_uring itself is under the memlock rlimit.
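
(For reference, a sketch of the memlock accounting being referred to,
modeled on the io_uring setup path in this series; the exact helper name
and shape are assumptions:)

static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
{
	unsigned long page_limit, cur_pages, new_pages;

	/* don't allow more pages than the RLIMIT_MEMLOCK budget permits */
	page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	do {
		cur_pages = atomic_long_read(&user->locked_vm);
		new_pages = cur_pages + nr_pages;
		if (new_pages > page_limit)
			return -ENOMEM;
	} while (atomic_long_cmpxchg(&user->locked_vm, cur_pages,
				     new_pages) != cur_pages);

	return 0;
}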

> static inline bool too_many_unix_fds(struct task_struct *p)
> {
> 	struct user_struct *user = current_user();
> 
> 	if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
> 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
> 	return false;
> }
> 
> RLIMIT_NOFILE is technically per-task, but here it is capping 
> unix_inflight per-user.  So the way I look at this, the number of file 
> descriptors per user is bounded by NOFILE * NPROC.  Then 
> user->unix_inflight can have one additional process' worth (NOFILE) of 
> "inflight" files.  (Plus SCM_MAX_FD slop, because too_many_fds() is only 
> called once per SCM_RIGHTS).
> 
> Because io_uring doesn't check too_many_unix_fds(), I think it will let 
> you have about 253 (or 1024) more process' worth of open files. That 
> could be big proportionally when RLIMIT_NPROC is low.
> 
> I don't know if it matters.  It maybe reads like an oversight though.
> 
> (If it does matter, it might be cleanest to change too_many_unix_fds() 
> to get rid of the "slop".  Since that may be different between af_unix 
> and io_uring; 253 v.s. 1024 or whatever. E.g. add a parameter for the 
> number of inflight files we want to add.)

I don't think it matters. The files in the fixed file set have already
been opened by the application, so they count towards the number of open
files that it is allowed to have. I don't think we should impose further
limits on top of that.
Alan Jenkins Feb. 12, 2019, 12:29 p.m. UTC | #5
On 08/02/2019 15:13, Jens Axboe wrote:
> On 2/8/19 7:02 AM, Alan Jenkins wrote:
>> On 08/02/2019 12:57, Jens Axboe wrote:
>>> On 2/8/19 5:17 AM, Alan Jenkins wrote:
>>>>> +static int io_sqe_files_scm(struct io_ring_ctx *ctx)
>>>>> +{
>>>>> +#if defined(CONFIG_NET)
>>>>> +	struct scm_fp_list *fpl = ctx->user_files;
>>>>> +	struct sk_buff *skb;
>>>>> +	int i;
>>>>> +
>>>>> +	skb =  __alloc_skb(0, GFP_KERNEL, 0, NUMA_NO_NODE);
>>>>> +	if (!skb)
>>>>> +		return -ENOMEM;
>>>>> +
>>>>> +	skb->sk = ctx->ring_sock->sk;
>>>>> +	skb->destructor = unix_destruct_scm;
>>>>> +
>>>>> +	fpl->user = get_uid(ctx->user);
>>>>> +	for (i = 0; i < fpl->count; i++) {
>>>>> +		get_file(fpl->fp[i]);
>>>>> +		unix_inflight(fpl->user, fpl->fp[i]);
>>>>> +		fput(fpl->fp[i]);
>>>>> +	}
>>>>> +
>>>>> +	UNIXCB(skb).fp = fpl;
>>>>> +	skb_queue_head(&ctx->ring_sock->sk->sk_receive_queue, skb);
>>>> This code sounds elegant if you know about the existence of unix_gc(),
>>>> but quite mysterious if you don't.  (E.g. why "inflight"?)  Could we
>>>> have a brief comment, to comfort mortal readers on their journey?
>>>>
>>>> /* A message on a unix socket can hold a reference to a file. This can
>>>> cause a reference cycle. So there is a garbage collector for unix
>>>> sockets, which we hook into here. */
>>> Yes that's a good idea, I've added a comment as to why we go through the
>>> trouble of doing this socket + skb dance.
>> Great, thanks.
>>
>>>> I think this is bypassing too_many_unix_fds() though?  I understood that
>>>> was intended to bound kernel memory allocation, at least in principle.
>>> As the code stands above, it'll cap it at 253. I'm just now reworking it
>>> to NOT be limited to the SCM max fd count, but still impose a limit of
>>> 1024 on the number of registered files. This is important to cap the
>>> memory allocation attempt as well.
>> I saw you were limiting to SCM_MAX_FD per io_uring.  On the other hand,
>> there's no specific limit on the number of io_urings you can open (only
>> the standard limits on fds).  So this would let you allocate hundreds of
>> times more files than the previous limit RLIMIT_NOFILE...
> But there is, the io_uring itself is under the memlock rlimit.
>
>> static inline bool too_many_unix_fds(struct task_struct *p)
>> {
>> 	struct user_struct *user = current_user();
>>
>> 	if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
>> 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
>> 	return false;
>> }
>>
>> RLIMIT_NOFILE is technically per-task, but here it is capping
>> unix_inflight per-user.  So the way I look at this, the number of file
>> descriptors per user is bounded by NOFILE * NPROC.  Then
>> user->unix_inflight can have one additional process' worth (NOFILE) of
>> "inflight" files.  (Plus SCM_MAX_FD slop, because too_many_fds() is only
>> called once per SCM_RIGHTS).
>>
>> Because io_uring doesn't check too_many_unix_fds(), I think it will let
>> you have about 253 (or 1024) more process' worth of open files. That
>> could be big proportionally when RLIMIT_NPROC is low.
>>
>> I don't know if it matters.  It maybe reads like an oversight though.
>>
>> (If it does matter, it might be cleanest to change too_many_unix_fds()
>> to get rid of the "slop".  Since that may be different between af_unix
>> and io_uring; 253 v.s. 1024 or whatever. E.g. add a parameter for the
>> number of inflight files we want to add.)
> I don't think it matters. The files in the fixed file set have already
> been opened by the application, so it counts towards the number of open
> files that is allowed to have. I don't think we should impose further
> limits on top of that.

A process can open one io_uring and 199 other files.  Register the 199 
files in the io_uring, then close their file descriptors.  The main 
NOFILE limit only counts file descriptors.  So then you can open one 
io_uring, 198 other files, and repeat.
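
(A sketch of that loop, reusing the assumed io_uring_setup()/
io_uring_register() syscall wrappers from the earlier sketch; each
iteration keeps one ring fd and ~199 pinned files while freeing the
fd-table slots:)

static void pin_files_beyond_nofile(void)
{
	for (;;) {
		struct io_uring_params params = { 0 };
		int32_t fds[199];
		int ring, i;

		ring = io_uring_setup(8, &params);
		if (ring < 0)
			break;		/* the fd-table NOFILE limit finally hit */
		for (i = 0; i < 199; i++)
			fds[i] = open("/dev/null", O_RDONLY);
		io_uring_register(ring, IORING_REGISTER_FILES, fds, 199);
		for (i = 0; i < 199; i++)
			close(fds[i]);	/* fd slots freed; files stay registered */
		/* 'ring' is deliberately kept open */
	}
}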

You're right, I had forgotten the memlock limit on io_uring.  That makes 
it much less of a practical problem.

But it raises a second point.  It's not just that it lets users allocate 
more files.  You might not want to be limited by user->unix_inflight.  
But you are calling unix_inflight(), which increments it!  Then if 
user->unix_inflight exceeds the NOFILE limit, you will avoid seeing any 
errors with io_uring, but the user will not be able to send files over 
unix sockets.

So I think this is confusing to read, and confusing to troubleshoot if 
the limit is ever hit.

I would be happy if io_uring didn't increment user->unix_inflight.  I'm 
not sure what the best way is to arrange that.

Regards
Alan
Jens Axboe Feb. 12, 2019, 3:17 p.m. UTC | #6
On 2/12/19 5:29 AM, Alan Jenkins wrote:
> On 08/02/2019 15:13, Jens Axboe wrote:
>> On 2/8/19 7:02 AM, Alan Jenkins wrote:
>>> On 08/02/2019 12:57, Jens Axboe wrote:
>>>> On 2/8/19 5:17 AM, Alan Jenkins wrote:
>>>>>> +static int io_sqe_files_scm(struct io_ring_ctx *ctx)
>>>>>> +{
>>>>>> +#if defined(CONFIG_NET)
>>>>>> +	struct scm_fp_list *fpl = ctx->user_files;
>>>>>> +	struct sk_buff *skb;
>>>>>> +	int i;
>>>>>> +
>>>>>> +	skb =  __alloc_skb(0, GFP_KERNEL, 0, NUMA_NO_NODE);
>>>>>> +	if (!skb)
>>>>>> +		return -ENOMEM;
>>>>>> +
>>>>>> +	skb->sk = ctx->ring_sock->sk;
>>>>>> +	skb->destructor = unix_destruct_scm;
>>>>>> +
>>>>>> +	fpl->user = get_uid(ctx->user);
>>>>>> +	for (i = 0; i < fpl->count; i++) {
>>>>>> +		get_file(fpl->fp[i]);
>>>>>> +		unix_inflight(fpl->user, fpl->fp[i]);
>>>>>> +		fput(fpl->fp[i]);
>>>>>> +	}
>>>>>> +
>>>>>> +	UNIXCB(skb).fp = fpl;
>>>>>> +	skb_queue_head(&ctx->ring_sock->sk->sk_receive_queue, skb);
>>>>> This code sounds elegant if you know about the existence of unix_gc(),
>>>>> but quite mysterious if you don't.  (E.g. why "inflight"?)  Could we
>>>>> have a brief comment, to comfort mortal readers on their journey?
>>>>>
>>>>> /* A message on a unix socket can hold a reference to a file. This can
>>>>> cause a reference cycle. So there is a garbage collector for unix
>>>>> sockets, which we hook into here. */
>>>> Yes that's a good idea, I've added a comment as to why we go through the
>>>> trouble of doing this socket + skb dance.
>>> Great, thanks.
>>>
>>>>> I think this is bypassing too_many_unix_fds() though?  I understood that
>>>>> was intended to bound kernel memory allocation, at least in principle.
>>>> As the code stands above, it'll cap it at 253. I'm just now reworking it
>>>> to NOT be limited to the SCM max fd count, but still impose a limit of
>>>> 1024 on the number of registered files. This is important to cap the
>>>> memory allocation attempt as well.
>>> I saw you were limiting to SCM_MAX_FD per io_uring.  On the other hand,
>>> there's no specific limit on the number of io_urings you can open (only
>>> the standard limits on fds).  So this would let you allocate hundreds of
>>> times more files than the previous limit RLIMIT_NOFILE...
>> But there is, the io_uring itself is under the memlock rlimit.
>>
>>> static inline bool too_many_unix_fds(struct task_struct *p)
>>> {
>>> 	struct user_struct *user = current_user();
>>>
>>> 	if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
>>> 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
>>> 	return false;
>>> }
>>>
>>> RLIMIT_NOFILE is technically per-task, but here it is capping
>>> unix_inflight per-user.  So the way I look at this, the number of file
>>> descriptors per user is bounded by NOFILE * NPROC.  Then
>>> user->unix_inflight can have one additional process' worth (NOFILE) of
>>> "inflight" files.  (Plus SCM_MAX_FD slop, because too_many_fds() is only
>>> called once per SCM_RIGHTS).
>>>
>>> Because io_uring doesn't check too_many_unix_fds(), I think it will let
>>> you have about 253 (or 1024) more process' worth of open files. That
>>> could be big proportionally when RLIMIT_NPROC is low.
>>>
>>> I don't know if it matters.  It maybe reads like an oversight though.
>>>
>>> (If it does matter, it might be cleanest to change too_many_unix_fds()
>>> to get rid of the "slop".  Since that may be different between af_unix
>>> and io_uring; 253 v.s. 1024 or whatever. E.g. add a parameter for the
>>> number of inflight files we want to add.)
>> I don't think it matters. The files in the fixed file set have already
>> been opened by the application, so it counts towards the number of open
>> files that is allowed to have. I don't think we should impose further
>> limits on top of that.
> 
> A process can open one io_uring and 199 other files.  Register the 199 
> files in the io_uring, then close their file descriptors.  The main 
> NOFILE limit only counts file descriptors.  So then you can open one 
> io_uring, 198 other files, and repeat.
> 
> You're right, I had forgotten the memlock limit on io_uring.  That makes 
> it much less of a practical problem.
> 
> But it raises a second point.  It's not just that it lets users allocate 
> more files.  You might not want to be limited by user->unix_inflight.  
> But you are calling unix_inflight(), which increments it!  Then if 
> unix->inflight exceeds the NOFILE limit, you will avoid seeing any 
> errors with io_uring, but the user will not be able to send files over 
> unix sockets.
> 
> So I think this is confusing to read, and confusing to troubleshoot if 
> the limit is ever hit.
> 
> I would be happy if io_uring didn't increment user->unix_inflight.  I'm 
> not sure what the best way is to arrange that.

How about we just do something like the below? I think that's the saner
approach, rather than bypass user->unix_inflight. It's literally the
same thing.


diff --git a/fs/io_uring.c b/fs/io_uring.c
index a4973af1c272..5196b3aa935e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2041,6 +2041,13 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
 	struct sk_buff *skb;
 	int i;
 
+	if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
+		struct user_struct *user = ctx->user;
+
+		if (user->unix_inflight > task_rlimit(current, RLIMIT_NOFILE))
+			return -EMFILE;
+	}
+
 	fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
 	if (!fpl)
 		return -ENOMEM;
Alan Jenkins Feb. 12, 2019, 5:21 p.m. UTC | #7
On 12/02/2019 15:17, Jens Axboe wrote:
> On 2/12/19 5:29 AM, Alan Jenkins wrote:
>> On 08/02/2019 15:13, Jens Axboe wrote:
>>> On 2/8/19 7:02 AM, Alan Jenkins wrote:
>>>> On 08/02/2019 12:57, Jens Axboe wrote:
>>>>> On 2/8/19 5:17 AM, Alan Jenkins wrote:
>>>>>>> +static int io_sqe_files_scm(struct io_ring_ctx *ctx)
>>>>>>> +{
>>>>>>> +#if defined(CONFIG_NET)
>>>>>>> +	struct scm_fp_list *fpl = ctx->user_files;
>>>>>>> +	struct sk_buff *skb;
>>>>>>> +	int i;
>>>>>>> +
>>>>>>> +	skb =  __alloc_skb(0, GFP_KERNEL, 0, NUMA_NO_NODE);
>>>>>>> +	if (!skb)
>>>>>>> +		return -ENOMEM;
>>>>>>> +
>>>>>>> +	skb->sk = ctx->ring_sock->sk;
>>>>>>> +	skb->destructor = unix_destruct_scm;
>>>>>>> +
>>>>>>> +	fpl->user = get_uid(ctx->user);
>>>>>>> +	for (i = 0; i < fpl->count; i++) {
>>>>>>> +		get_file(fpl->fp[i]);
>>>>>>> +		unix_inflight(fpl->user, fpl->fp[i]);
>>>>>>> +		fput(fpl->fp[i]);
>>>>>>> +	}
>>>>>>> +
>>>>>>> +	UNIXCB(skb).fp = fpl;
>>>>>>> +	skb_queue_head(&ctx->ring_sock->sk->sk_receive_queue, skb);
>>>>>> This code sounds elegant if you know about the existence of unix_gc(),
>>>>>> but quite mysterious if you don't.  (E.g. why "inflight"?)  Could we
>>>>>> have a brief comment, to comfort mortal readers on their journey?
>>>>>>
>>>>>> /* A message on a unix socket can hold a reference to a file. This can
>>>>>> cause a reference cycle. So there is a garbage collector for unix
>>>>>> sockets, which we hook into here. */
>>>>> Yes that's a good idea, I've added a comment as to why we go through the
>>>>> trouble of doing this socket + skb dance.
>>>> Great, thanks.
>>>>
>>>>>> I think this is bypassing too_many_unix_fds() though?  I understood that
>>>>>> was intended to bound kernel memory allocation, at least in principle.
>>>>> As the code stands above, it'll cap it at 253. I'm just now reworking it
>>>>> to NOT be limited to the SCM max fd count, but still impose a limit of
>>>>> 1024 on the number of registered files. This is important to cap the
>>>>> memory allocation attempt as well.
>>>> I saw you were limiting to SCM_MAX_FD per io_uring.  On the other hand,
>>>> there's no specific limit on the number of io_urings you can open (only
>>>> the standard limits on fds).  So this would let you allocate hundreds of
>>>> times more files than the previous limit RLIMIT_NOFILE...
>>> But there is, the io_uring itself is under the memlock rlimit.
>>>
>>>> static inline bool too_many_unix_fds(struct task_struct *p)
>>>> {
>>>> 	struct user_struct *user = current_user();
>>>>
>>>> 	if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
>>>> 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
>>>> 	return false;
>>>> }
>>>>
>>>> RLIMIT_NOFILE is technically per-task, but here it is capping
>>>> unix_inflight per-user.  So the way I look at this, the number of file
>>>> descriptors per user is bounded by NOFILE * NPROC.  Then
>>>> user->unix_inflight can have one additional process' worth (NOFILE) of
>>>> "inflight" files.  (Plus SCM_MAX_FD slop, because too_many_fds() is only
>>>> called once per SCM_RIGHTS).
>>>>
>>>> Because io_uring doesn't check too_many_unix_fds(), I think it will let
>>>> you have about 253 (or 1024) more process' worth of open files. That
>>>> could be big proportionally when RLIMIT_NPROC is low.
>>>>
>>>> I don't know if it matters.  It maybe reads like an oversight though.
>>>>
>>>> (If it does matter, it might be cleanest to change too_many_unix_fds()
>>>> to get rid of the "slop".  Since that may be different between af_unix
>>>> and io_uring; 253 v.s. 1024 or whatever. E.g. add a parameter for the
>>>> number of inflight files we want to add.)
>>> I don't think it matters. The files in the fixed file set have already
>>> been opened by the application, so it counts towards the number of open
>>> files that is allowed to have. I don't think we should impose further
>>> limits on top of that.
>> A process can open one io_uring and 199 other files.  Register the 199
>> files in the io_uring, then close their file descriptors.  The main
>> NOFILE limit only counts file descriptors.  So then you can open one
>> io_uring, 198 other files, and repeat.
>>
>> You're right, I had forgotten the memlock limit on io_uring.  That makes
>> it much less of a practical problem.
>>
>> But it raises a second point.  It's not just that it lets users allocate
>> more files.  You might not want to be limited by user->unix_inflight.
>> But you are calling unix_inflight(), which increments it!  Then if
>> unix->inflight exceeds the NOFILE limit, you will avoid seeing any
>> errors with io_uring, but the user will not be able to send files over
>> unix sockets.
>>
>> So I think this is confusing to read, and confusing to troubleshoot if
>> the limit is ever hit.
>>
>> I would be happy if io_uring didn't increment user->unix_inflight.  I'm
>> not sure what the best way is to arrange that.
> How about we just do something like the below? I think that's the saner
> approach, rather than bypass user->unix_inflight. It's literally the
> same thing.
>
>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index a4973af1c272..5196b3aa935e 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -2041,6 +2041,13 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
>   	struct sk_buff *skb;
>   	int i;
>   
> +	if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
> +		struct user_struct *user = ctx->user;
> +
> +		if (user->unix_inflight > task_rlimit(current, RLIMIT_NOFILE))
> +			return -EMFILE;
> +	}
> +
>   	fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
>   	if (!fpl)
>   		return -ENOMEM;
>
>

Welp, you gave me exactly what I asked for.  So now I'd better be 
positive about it :-D.

I hope this will be documented accurately, at least where the EMFILE 
result is explained for this syscall.

Because EMFILE is different from the errno in af_unix.c, I will add a 
wish for the existing documentation of ETOOMANYREFS in unix(7) to 
reference this.

I'll stop bikeshedding there.  EMFILE sounds ok.  strerror() calls 
ETOOMANYREFS "Too many references: cannot splice"; it doesn't seem to be 
particularly helpful or well-known.

Thanks
Alan
Jens Axboe Feb. 12, 2019, 5:33 p.m. UTC | #8
On 2/12/19 10:21 AM, Alan Jenkins wrote:
> On 12/02/2019 15:17, Jens Axboe wrote:
>> On 2/12/19 5:29 AM, Alan Jenkins wrote:
>>> On 08/02/2019 15:13, Jens Axboe wrote:
>>>> On 2/8/19 7:02 AM, Alan Jenkins wrote:
>>>>> On 08/02/2019 12:57, Jens Axboe wrote:
>>>>>> On 2/8/19 5:17 AM, Alan Jenkins wrote:
>>>>>>>> +static int io_sqe_files_scm(struct io_ring_ctx *ctx)
>>>>>>>> +{
>>>>>>>> +#if defined(CONFIG_NET)
>>>>>>>> +	struct scm_fp_list *fpl = ctx->user_files;
>>>>>>>> +	struct sk_buff *skb;
>>>>>>>> +	int i;
>>>>>>>> +
>>>>>>>> +	skb =  __alloc_skb(0, GFP_KERNEL, 0, NUMA_NO_NODE);
>>>>>>>> +	if (!skb)
>>>>>>>> +		return -ENOMEM;
>>>>>>>> +
>>>>>>>> +	skb->sk = ctx->ring_sock->sk;
>>>>>>>> +	skb->destructor = unix_destruct_scm;
>>>>>>>> +
>>>>>>>> +	fpl->user = get_uid(ctx->user);
>>>>>>>> +	for (i = 0; i < fpl->count; i++) {
>>>>>>>> +		get_file(fpl->fp[i]);
>>>>>>>> +		unix_inflight(fpl->user, fpl->fp[i]);
>>>>>>>> +		fput(fpl->fp[i]);
>>>>>>>> +	}
>>>>>>>> +
>>>>>>>> +	UNIXCB(skb).fp = fpl;
>>>>>>>> +	skb_queue_head(&ctx->ring_sock->sk->sk_receive_queue, skb);
>>>>>>> This code sounds elegant if you know about the existence of unix_gc(),
>>>>>>> but quite mysterious if you don't.  (E.g. why "inflight"?)  Could we
>>>>>>> have a brief comment, to comfort mortal readers on their journey?
>>>>>>>
>>>>>>> /* A message on a unix socket can hold a reference to a file. This can
>>>>>>> cause a reference cycle. So there is a garbage collector for unix
>>>>>>> sockets, which we hook into here. */
>>>>>> Yes that's a good idea, I've added a comment as to why we go through the
>>>>>> trouble of doing this socket + skb dance.
>>>>> Great, thanks.
>>>>>
>>>>>>> I think this is bypassing too_many_unix_fds() though?  I understood that
>>>>>>> was intended to bound kernel memory allocation, at least in principle.
>>>>>> As the code stands above, it'll cap it at 253. I'm just now reworking it
>>>>>> to NOT be limited to the SCM max fd count, but still impose a limit of
>>>>>> 1024 on the number of registered files. This is important to cap the
>>>>>> memory allocation attempt as well.
>>>>> I saw you were limiting to SCM_MAX_FD per io_uring.  On the other hand,
>>>>> there's no specific limit on the number of io_urings you can open (only
>>>>> the standard limits on fds).  So this would let you allocate hundreds of
>>>>> times more files than the previous limit RLIMIT_NOFILE...
>>>> But there is, the io_uring itself is under the memlock rlimit.
>>>>
>>>>> static inline bool too_many_unix_fds(struct task_struct *p)
>>>>> {
>>>>> 	struct user_struct *user = current_user();
>>>>>
>>>>> 	if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
>>>>> 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
>>>>> 	return false;
>>>>> }
>>>>>
>>>>> RLIMIT_NOFILE is technically per-task, but here it is capping
>>>>> unix_inflight per-user.  So the way I look at this, the number of file
>>>>> descriptors per user is bounded by NOFILE * NPROC.  Then
>>>>> user->unix_inflight can have one additional process' worth (NOFILE) of
>>>>> "inflight" files.  (Plus SCM_MAX_FD slop, because too_many_fds() is only
>>>>> called once per SCM_RIGHTS).
>>>>>
>>>>> Because io_uring doesn't check too_many_unix_fds(), I think it will let
>>>>> you have about 253 (or 1024) more process' worth of open files. That
>>>>> could be big proportionally when RLIMIT_NPROC is low.
>>>>>
>>>>> I don't know if it matters.  It maybe reads like an oversight though.
>>>>>
>>>>> (If it does matter, it might be cleanest to change too_many_unix_fds()
>>>>> to get rid of the "slop".  Since that may be different between af_unix
>>>>> and io_uring; 253 v.s. 1024 or whatever. E.g. add a parameter for the
>>>>> number of inflight files we want to add.)
>>>> I don't think it matters. The files in the fixed file set have already
>>>> been opened by the application, so it counts towards the number of open
>>>> files that is allowed to have. I don't think we should impose further
>>>> limits on top of that.
>>> A process can open one io_uring and 199 other files.  Register the 199
>>> files in the io_uring, then close their file descriptors.  The main
>>> NOFILE limit only counts file descriptors.  So then you can open one
>>> io_uring, 198 other files, and repeat.
>>>
>>> You're right, I had forgotten the memlock limit on io_uring.  That makes
>>> it much less of a practical problem.
>>>
>>> But it raises a second point.  It's not just that it lets users allocate
>>> more files.  You might not want to be limited by user->unix_inflight.
>>> But you are calling unix_inflight(), which increments it!  Then if
>>> unix->inflight exceeds the NOFILE limit, you will avoid seeing any
>>> errors with io_uring, but the user will not be able to send files over
>>> unix sockets.
>>>
>>> So I think this is confusing to read, and confusing to troubleshoot if
>>> the limit is ever hit.
>>>
>>> I would be happy if io_uring didn't increment user->unix_inflight.  I'm
>>> not sure what the best way is to arrange that.
>> How about we just do something like the below? I think that's the saner
>> approach, rather than bypass user->unix_inflight. It's literally the
>> same thing.
>>
>>
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index a4973af1c272..5196b3aa935e 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -2041,6 +2041,13 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
>>   	struct sk_buff *skb;
>>   	int i;
>>   
>> +	if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
>> +		struct user_struct *user = ctx->user;
>> +
>> +		if (user->unix_inflight > task_rlimit(current, RLIMIT_NOFILE))
>> +			return -EMFILE;
>> +	}
>> +
>>   	fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
>>   	if (!fpl)
>>   		return -ENOMEM;
>>
>>
> 
> Welp, you gave me exactly what I asked for.  So now I'd better be 
> positive about it :-D.

;-)

> I hope this will be documented accurately, at least where the EMFILE 
> result is explained for this syscall.

How's this:

http://git.kernel.dk/cgit/liburing/commit/?id=37e48698a09aa1e37690f8fa6dfd8da69a48ee60

> Because EMFILE is different from the errno in af_unix.c, I will add a 
> wish for the existing documentation of ETOOMANYREFS in unix(7) to 
> reference this.
> 
> I'll stop bikeshedding there.  EMFILE sounds ok.  strerror() calls 
> ETOOMANYREFS "Too many references: cannot splice"; it doesn't seem to be 
> particularly helpful or well-known.

Agree
Alan Jenkins Feb. 12, 2019, 8:23 p.m. UTC | #9
On 12/02/2019 17:33, Jens Axboe wrote:
> On 2/12/19 10:21 AM, Alan Jenkins wrote:
>> On 12/02/2019 15:17, Jens Axboe wrote:
>>> On 2/12/19 5:29 AM, Alan Jenkins wrote:
>>>> On 08/02/2019 15:13, Jens Axboe wrote:
>>>>> On 2/8/19 7:02 AM, Alan Jenkins wrote:
>>>>>> On 08/02/2019 12:57, Jens Axboe wrote:
>>>>>>> On 2/8/19 5:17 AM, Alan Jenkins wrote:
>>>>>>>>> +static int io_sqe_files_scm(struct io_ring_ctx *ctx)
>>>>>>>>> +{
>>>>>>>>> +#if defined(CONFIG_NET)
>>>>>>>>> +	struct scm_fp_list *fpl = ctx->user_files;
>>>>>>>>> +	struct sk_buff *skb;
>>>>>>>>> +	int i;
>>>>>>>>> +
>>>>>>>>> +	skb =  __alloc_skb(0, GFP_KERNEL, 0, NUMA_NO_NODE);
>>>>>>>>> +	if (!skb)
>>>>>>>>> +		return -ENOMEM;
>>>>>>>>> +
>>>>>>>>> +	skb->sk = ctx->ring_sock->sk;
>>>>>>>>> +	skb->destructor = unix_destruct_scm;
>>>>>>>>> +
>>>>>>>>> +	fpl->user = get_uid(ctx->user);
>>>>>>>>> +	for (i = 0; i < fpl->count; i++) {
>>>>>>>>> +		get_file(fpl->fp[i]);
>>>>>>>>> +		unix_inflight(fpl->user, fpl->fp[i]);
>>>>>>>>> +		fput(fpl->fp[i]);
>>>>>>>>> +	}
>>>>>>>>> +
>>>>>>>>> +	UNIXCB(skb).fp = fpl;
>>>>>>>>> +	skb_queue_head(&ctx->ring_sock->sk->sk_receive_queue, skb);
>>>>>>>> This code sounds elegant if you know about the existence of unix_gc(),
>>>>>>>> but quite mysterious if you don't.  (E.g. why "inflight"?)  Could we
>>>>>>>> have a brief comment, to comfort mortal readers on their journey?
>>>>>>>>
>>>>>>>> /* A message on a unix socket can hold a reference to a file. This can
>>>>>>>> cause a reference cycle. So there is a garbage collector for unix
>>>>>>>> sockets, which we hook into here. */
>>>>>>> Yes that's a good idea, I've added a comment as to why we go through the
>>>>>>> trouble of doing this socket + skb dance.
>>>>>> Great, thanks.
>>>>>>
>>>>>>>> I think this is bypassing too_many_unix_fds() though?  I understood that
>>>>>>>> was intended to bound kernel memory allocation, at least in principle.
>>>>>>> As the code stands above, it'll cap it at 253. I'm just now reworking it
>>>>>>> to NOT be limited to the SCM max fd count, but still impose a limit of
>>>>>>> 1024 on the number of registered files. This is important to cap the
>>>>>>> memory allocation attempt as well.
>>>>>> I saw you were limiting to SCM_MAX_FD per io_uring.  On the other hand,
>>>>>> there's no specific limit on the number of io_urings you can open (only
>>>>>> the standard limits on fds).  So this would let you allocate hundreds of
>>>>>> times more files than the previous limit RLIMIT_NOFILE...
>>>>> But there is, the io_uring itself is under the memlock rlimit.
>>>>>
>>>>>> static inline bool too_many_unix_fds(struct task_struct *p)
>>>>>> {
>>>>>> 	struct user_struct *user = current_user();
>>>>>>
>>>>>> 	if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
>>>>>> 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
>>>>>> 	return false;
>>>>>> }
>>>>>>
>>>>>> RLIMIT_NOFILE is technically per-task, but here it is capping
>>>>>> unix_inflight per-user.  So the way I look at this, the number of file
>>>>>> descriptors per user is bounded by NOFILE * NPROC.  Then
>>>>>> user->unix_inflight can have one additional process' worth (NOFILE) of
>>>>>> "inflight" files.  (Plus SCM_MAX_FD slop, because too_many_fds() is only
>>>>>> called once per SCM_RIGHTS).
>>>>>>
>>>>>> Because io_uring doesn't check too_many_unix_fds(), I think it will let
>>>>>> you have about 253 (or 1024) more process' worth of open files. That
>>>>>> could be big proportionally when RLIMIT_NPROC is low.
>>>>>>
>>>>>> I don't know if it matters.  It maybe reads like an oversight though.
>>>>>>
>>>>>> (If it does matter, it might be cleanest to change too_many_unix_fds()
>>>>>> to get rid of the "slop".  Since that may be different between af_unix
>>>>>> and io_uring; 253 v.s. 1024 or whatever. E.g. add a parameter for the
>>>>>> number of inflight files we want to add.)
>>>>> I don't think it matters. The files in the fixed file set have already
>>>>> been opened by the application, so it counts towards the number of open
>>>>> files that is allowed to have. I don't think we should impose further
>>>>> limits on top of that.
>>>> A process can open one io_uring and 199 other files.  Register the 199
>>>> files in the io_uring, then close their file descriptors.  The main
>>>> NOFILE limit only counts file descriptors.  So then you can open one
>>>> io_uring, 198 other files, and repeat.
>>>>
>>>> You're right, I had forgotten the memlock limit on io_uring.  That makes
>>>> it much less of a practical problem.
>>>>
>>>> But it raises a second point.  It's not just that it lets users allocate
>>>> more files.  You might not want to be limited by user->unix_inflight.
>>>> But you are calling unix_inflight(), which increments it!  Then if
>>>> unix->inflight exceeds the NOFILE limit, you will avoid seeing any
>>>> errors with io_uring, but the user will not be able to send files over
>>>> unix sockets.
>>>>
>>>> So I think this is confusing to read, and confusing to troubleshoot if
>>>> the limit is ever hit.
>>>>
>>>> I would be happy if io_uring didn't increment user->unix_inflight.  I'm
>>>> not sure what the best way is to arrange that.
>>> How about we just do something like the below? I think that's the saner
>>> approach, rather than bypass user->unix_inflight. It's literally the
>>> same thing.
>>>
>>>
>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>> index a4973af1c272..5196b3aa935e 100644
>>> --- a/fs/io_uring.c
>>> +++ b/fs/io_uring.c
>>> @@ -2041,6 +2041,13 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
>>>    	struct sk_buff *skb;
>>>    	int i;
>>>    
>>> +	if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
>>> +		struct user_struct *user = ctx->user;
>>> +
>>> +		if (user->unix_inflight > task_rlimit(current, RLIMIT_NOFILE))
>>> +			return -EMFILE;
>>> +	}
>>> +
>>>    	fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
>>>    	if (!fpl)
>>>    		return -ENOMEM;
>>>
>>>
>> Welp, you gave me exactly what I asked for.  So now I'd better be
>> positive about it :-D.
> ;-)
>
>> I hope this will be documented accurately, at least where the EMFILE
>> result is explained for this syscall.
> How's this:
>
> http://git.kernel.dk/cgit/liburing/commit/?id=37e48698a09aa1e37690f8fa6dfd8da69a48ee60

+.B EMFILE
+.BR IORING_REGISTER_FILES
+was specified and adding
+.I nr_args
+file references would exceed the maximum allowed number of files the process
+is allowed to have according to the
+.B
+RLIMIT_NOFILE
+resource limit and the caller does not have
+.B CAP_SYS_RESOURCE
+capability.
+.TP

I was struggling with this.  The POSIX part of RLIMIT_NOFILE is applied 
per-process.  But the part we're talking about here, the Linux-specific 
"unix_inflight" resource, is actually accounted per-user.  It's like 
RLIMIT_NPROC.  The value of RLIMIT_NPROC is per-process, but the 
resource it limits is counted in user->processes.

This subtlety of the NOFILE limit is not made clear in the text above, 
nor in unix(7), nor in getrlimit(2).  I would interpret all these docs 
as saying this limit is a per-process thing - I think they are misleading.
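
(The kernel side that makes this per-user is unix_inflight() in
net/unix/garbage.c; abridged here, with the gc-list bookkeeping elided:)

void unix_inflight(struct user_struct *user, struct file *fp)
{
	struct sock *s = unix_get_socket(fp);

	spin_lock(&unix_gc_lock);
	if (s) {
		/* ... move the unix socket onto the gc inflight list ... */
	}
	user->unix_inflight++;	/* per-user accounting, like user->processes */
	spin_unlock(&unix_gc_lock);
}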

IORING_MAX_FIXED_FILES is being raised to 1024, which is the same as the 
(soft limit) value for RLIMIT_NOFILE which the kernel sets for the init 
process.  I have an unjustifiable nervousness that there will be some 
`fio` command, or a test written that maxes out IORING_REGISTER_FILES.  
When you do that, it will provoke unexpected failures e.g. in GUI apps.  
If we can't rule that out, the next best thing is a friendly man page.

Regards
Alan

>> Because EMFILE is different from the errno in af_unix.c, I will add a
>> wish for the existing documentation of ETOOMANYREFS in unix(7) to
>> reference this.
>>
>> I'll stop bikeshedding there.  EMFILE sounds ok.  strerror() calls
>> ETOOMANYREFS "Too many references: cannot splice"; it doesn't seem to be
>> particularly helpful or well-known.
> Agree
>
Jens Axboe Feb. 12, 2019, 9:10 p.m. UTC | #10
On 2/12/19 1:23 PM, Alan Jenkins wrote:
> On 12/02/2019 17:33, Jens Axboe wrote:
>> On 2/12/19 10:21 AM, Alan Jenkins wrote:
>>> On 12/02/2019 15:17, Jens Axboe wrote:
>>>> On 2/12/19 5:29 AM, Alan Jenkins wrote:
>>>>> On 08/02/2019 15:13, Jens Axboe wrote:
>>>>>> On 2/8/19 7:02 AM, Alan Jenkins wrote:
>>>>>>> On 08/02/2019 12:57, Jens Axboe wrote:
>>>>>>>> On 2/8/19 5:17 AM, Alan Jenkins wrote:
>>>>>>>>>> +static int io_sqe_files_scm(struct io_ring_ctx *ctx)
>>>>>>>>>> +{
>>>>>>>>>> +#if defined(CONFIG_NET)
>>>>>>>>>> +	struct scm_fp_list *fpl = ctx->user_files;
>>>>>>>>>> +	struct sk_buff *skb;
>>>>>>>>>> +	int i;
>>>>>>>>>> +
>>>>>>>>>> +	skb =  __alloc_skb(0, GFP_KERNEL, 0, NUMA_NO_NODE);
>>>>>>>>>> +	if (!skb)
>>>>>>>>>> +		return -ENOMEM;
>>>>>>>>>> +
>>>>>>>>>> +	skb->sk = ctx->ring_sock->sk;
>>>>>>>>>> +	skb->destructor = unix_destruct_scm;
>>>>>>>>>> +
>>>>>>>>>> +	fpl->user = get_uid(ctx->user);
>>>>>>>>>> +	for (i = 0; i < fpl->count; i++) {
>>>>>>>>>> +		get_file(fpl->fp[i]);
>>>>>>>>>> +		unix_inflight(fpl->user, fpl->fp[i]);
>>>>>>>>>> +		fput(fpl->fp[i]);
>>>>>>>>>> +	}
>>>>>>>>>> +
>>>>>>>>>> +	UNIXCB(skb).fp = fpl;
>>>>>>>>>> +	skb_queue_head(&ctx->ring_sock->sk->sk_receive_queue, skb);
>>>>>>>>> This code sounds elegant if you know about the existence of unix_gc(),
>>>>>>>>> but quite mysterious if you don't.  (E.g. why "inflight"?)  Could we
>>>>>>>>> have a brief comment, to comfort mortal readers on their journey?
>>>>>>>>>
>>>>>>>>> /* A message on a unix socket can hold a reference to a file. This can
>>>>>>>>> cause a reference cycle. So there is a garbage collector for unix
>>>>>>>>> sockets, which we hook into here. */
>>>>>>>> Yes that's a good idea, I've added a comment as to why we go through the
>>>>>>>> trouble of doing this socket + skb dance.
>>>>>>> Great, thanks.
>>>>>>>
>>>>>>>>> I think this is bypassing too_many_unix_fds() though?  I understood that
>>>>>>>>> was intended to bound kernel memory allocation, at least in principle.
>>>>>>>> As the code stands above, it'll cap it at 253. I'm just now reworking it
>>>>>>>> to NOT be limited to the SCM max fd count, but still impose a limit of
>>>>>>>> 1024 on the number of registered files. This is important to cap the
>>>>>>>> memory allocation attempt as well.
>>>>>>> I saw you were limiting to SCM_MAX_FD per io_uring.  On the other hand,
>>>>>>> there's no specific limit on the number of io_urings you can open (only
>>>>>>> the standard limits on fds).  So this would let you allocate hundreds of
>>>>>>> times more files than the previous limit RLIMIT_NOFILE...
>>>>>> But there is, the io_uring itself is under the memlock rlimit.
>>>>>>
>>>>>>> static inline bool too_many_unix_fds(struct task_struct *p)
>>>>>>> {
>>>>>>> 	struct user_struct *user = current_user();
>>>>>>>
>>>>>>> 	if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
>>>>>>> 		return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
>>>>>>> 	return false;
>>>>>>> }
>>>>>>>
>>>>>>> RLIMIT_NOFILE is technically per-task, but here it is capping
>>>>>>> unix_inflight per-user.  So the way I look at this, the number of file
>>>>>>> descriptors per user is bounded by NOFILE * NPROC.  Then
>>>>>>> user->unix_inflight can have one additional process' worth (NOFILE) of
>>>>>>> "inflight" files.  (Plus SCM_MAX_FD slop, because too_many_fds() is only
>>>>>>> called once per SCM_RIGHTS).
>>>>>>>
>>>>>>> Because io_uring doesn't check too_many_unix_fds(), I think it will let
>>>>>>> you have about 253 (or 1024) more process' worth of open files. That
>>>>>>> could be big proportionally when RLIMIT_NPROC is low.
>>>>>>>
>>>>>>> I don't know if it matters.  It maybe reads like an oversight though.
>>>>>>>
>>>>>>> (If it does matter, it might be cleanest to change too_many_unix_fds()
>>>>>>> to get rid of the "slop".  Since that may be different between af_unix
>>>>>>> and io_uring; 253 v.s. 1024 or whatever. E.g. add a parameter for the
>>>>>>> number of inflight files we want to add.)
>>>>>> I don't think it matters. The files in the fixed file set have already
>>>>>> been opened by the application, so it counts towards the number of open
>>>>>> files that is allowed to have. I don't think we should impose further
>>>>>> limits on top of that.
>>>>> A process can open one io_uring and 199 other files.  Register the 199
>>>>> files in the io_uring, then close their file descriptors.  The main
>>>>> NOFILE limit only counts file descriptors.  So then you can open one
>>>>> io_uring, 198 other files, and repeat.
>>>>>
>>>>> You're right, I had forgotten the memlock limit on io_uring.  That makes
>>>>> it much less of a practical problem.
>>>>>
>>>>> But it raises a second point.  It's not just that it lets users allocate
>>>>> more files.  You might not want to be limited by user->unix_inflight.
>>>>> But you are calling unix_inflight(), which increments it!  Then if
>>>>> unix->inflight exceeds the NOFILE limit, you will avoid seeing any
>>>>> errors with io_uring, but the user will not be able to send files over
>>>>> unix sockets.
>>>>>
>>>>> So I think this is confusing to read, and confusing to troubleshoot if
>>>>> the limit is ever hit.
>>>>>
>>>>> I would be happy if io_uring didn't increment user->unix_inflight.  I'm
>>>>> not sure what the best way is to arrange that.
>>>> How about we just do something like the below? I think that's the saner
>>>> approach, rather than bypass user->unix_inflight. It's literally the
>>>> same thing.
>>>>
>>>>
>>>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>>>> index a4973af1c272..5196b3aa935e 100644
>>>> --- a/fs/io_uring.c
>>>> +++ b/fs/io_uring.c
>>>> @@ -2041,6 +2041,13 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
>>>>    	struct sk_buff *skb;
>>>>    	int i;
>>>>    
>>>> +	if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
>>>> +		struct user_struct *user = ctx->user;
>>>> +
>>>> +		if (user->unix_inflight > task_rlimit(current, RLIMIT_NOFILE))
>>>> +			return -EMFILE;
>>>> +	}
>>>> +
>>>>    	fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
>>>>    	if (!fpl)
>>>>    		return -ENOMEM;
>>>>
>>>>
>>> Welp, you gave me exactly what I asked for.  So now I'd better be
>>> positive about it :-D.
>> ;-)
>>
>>> I hope this will be documented accurately, at least where the EMFILE
>>> result is explained for this syscall.
>> How's this:
>>
>> http://git.kernel.dk/cgit/liburing/commit/?id=37e48698a09aa1e37690f8fa6dfd8da69a48ee60
> 
> +.B EMFILE
> +.BR IORING_REGISTER_FILES
> +was specified and adding
> +.I nr_args
> +file references would exceed the maximum allowed number of files the process
> +is allowed to have according to the
> +.B
> +RLIMIT_NOFILE
> +resource limit and the caller does not have
> +.B CAP_SYS_RESOURCE
> +capability.
> +.TP
> 
> I was struggling with this.  The POSIX part of RLIMIT_NOFILE is applied 
> per-process.  But the part we're talking about here, the Linux-specific 
> "unix_inflight" resource, is actually accounted per-user.  It's like 
> RLIMIT_NPROC.  The value of RLIMIT_NPROC is per-process, but the 
> resource it limits is counted in user->processes.
> 
> This subtlety of the NOFILE limit is not made clear in the text above, 
> nor in unix(7), nor in getrlimit(2).  I would interpret all these docs 
> as saying this limit is a per-process thing - I think they are misleading.

Fair point, I'll add an update to clearly state it's a per-process
limit.

> IORING_MAX_FIXED_FILES is being raised to 1024, which is the same as the 
> (soft limit) value for RLIMIT_NOFILE which the kernel sets for the init 
> process.  I have an unjustifiable nervousness, that there will be some 
> `fio` command, or a test written that maxes out IORING_REGISTER_FILES.  
> When you do that, it will provoke unexpected failures e.g. in GUI apps.  
> If we can't rule that out, the next best thing is a friendly man page.

If we apply the limit to sendmsg and friends, it should be applied here
as well.

Patch

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9d6233dc35ca..f2550efec60d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -29,6 +29,7 @@ 
 #include <linux/net.h>
 #include <net/sock.h>
 #include <net/af_unix.h>
+#include <net/scm.h>
 #include <linux/anon_inodes.h>
 #include <linux/sched/mm.h>
 #include <linux/uaccess.h>
@@ -101,6 +102,13 @@  struct io_ring_ctx {
 		struct fasync_struct	*cq_fasync;
 	} ____cacheline_aligned_in_smp;
 
+	/*
+	 * If used, fixed file set. Writers must ensure that ->refs is dead,
+	 * readers must ensure that ->refs is alive as long as the file* is
+	 * used. Only updated through io_uring_register(2).
+	 */
+	struct scm_fp_list	*user_files;
+
 	/* if used, fixed mapped user buffers */
 	unsigned		nr_user_bufs;
 	struct io_mapped_ubuf	*user_bufs;
@@ -148,6 +156,7 @@  struct io_kiocb {
 	unsigned int		flags;
 #define REQ_F_FORCE_NONBLOCK	1	/* inline submission attempt */
 #define REQ_F_IOPOLL_COMPLETED	2	/* polled IO has completed */
+#define REQ_F_FIXED_FILE	4	/* ctx owns file */
 	u64			user_data;
 	u64			error;
 
@@ -374,15 +383,17 @@  static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		 * Batched puts of the same file, to avoid dirtying the
 		 * file usage count multiple times, if avoidable.
 		 */
-		if (!file) {
-			file = req->rw.ki_filp;
-			file_count = 1;
-		} else if (file == req->rw.ki_filp) {
-			file_count++;
-		} else {
-			fput_many(file, file_count);
-			file = req->rw.ki_filp;
-			file_count = 1;
+		if (!(req->flags & REQ_F_FIXED_FILE)) {
+			if (!file) {
+				file = req->rw.ki_filp;
+				file_count = 1;
+			} else if (file == req->rw.ki_filp) {
+				file_count++;
+			} else {
+				fput_many(file, file_count);
+				file = req->rw.ki_filp;
+				file_count = 1;
+			}
 		}
 
 		if (to_free == ARRAY_SIZE(reqs))
@@ -514,13 +525,19 @@  static void kiocb_end_write(struct kiocb *kiocb)
 	}
 }
 
+static void io_fput(struct io_kiocb *req)
+{
+	if (!(req->flags & REQ_F_FIXED_FILE))
+		fput(req->rw.ki_filp);
+}
+
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw);
 
 	kiocb_end_write(kiocb);
 
-	fput(kiocb->ki_filp);
+	io_fput(req);
 	io_cqring_add_event(req->ctx, req->user_data, res, 0);
 	io_free_req(req);
 }
@@ -636,19 +653,29 @@  static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	struct kiocb *kiocb = &req->rw;
-	unsigned ioprio;
+	unsigned ioprio, flags;
 	int fd, ret;
 
 	/* For -EAGAIN retry, everything is already prepped */
 	if (kiocb->ki_filp)
 		return 0;
 
+	flags = READ_ONCE(sqe->flags);
 	fd = READ_ONCE(sqe->fd);
-	kiocb->ki_filp = io_file_get(state, fd);
-	if (unlikely(!kiocb->ki_filp))
-		return -EBADF;
-	if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
-		force_nonblock = false;
+
+	if (flags & IOSQE_FIXED_FILE) {
+		if (unlikely(!ctx->user_files ||
+		    (unsigned) fd >= ctx->user_files->count))
+			return -EBADF;
+		kiocb->ki_filp = ctx->user_files->fp[fd];
+		req->flags |= REQ_F_FIXED_FILE;
+	} else {
+		kiocb->ki_filp = io_file_get(state, fd);
+		if (unlikely(!kiocb->ki_filp))
+			return -EBADF;
+		if (force_nonblock && !io_file_supports_async(kiocb->ki_filp))
+			force_nonblock = false;
+	}
 	kiocb->ki_pos = READ_ONCE(sqe->off);
 	kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
 	kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
@@ -688,10 +715,14 @@  static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	}
 	return 0;
 out_fput:
-	/* in case of error, we didn't use this file reference. drop it. */
-	if (state)
-		state->used_refs--;
-	io_file_put(state, kiocb->ki_filp);
+	if (!(flags & IOSQE_FIXED_FILE)) {
+		/*
+		 * in case of error, we didn't use this file reference. drop it.
+		 */
+		if (state)
+			state->used_refs--;
+		io_file_put(state, kiocb->ki_filp);
+	}
 	return ret;
 }
 
@@ -823,7 +854,7 @@  static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s,
 out_fput:
 	/* Hold on to the file for -EAGAIN */
 	if (unlikely(ret && ret != -EAGAIN))
-		fput(file);
+		io_fput(req);
 	return ret;
 }
 
@@ -877,7 +908,7 @@  static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s,
 	kfree(iovec);
 out_fput:
 	if (unlikely(ret))
-		fput(file);
+		io_fput(req);
 	return ret;
 }
 
@@ -903,7 +934,7 @@  static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	loff_t sqe_off = READ_ONCE(sqe->off);
 	loff_t sqe_len = READ_ONCE(sqe->len);
 	loff_t end = sqe_off + sqe_len;
-	unsigned fsync_flags;
+	unsigned fsync_flags, flags;
 	struct file *file;
 	int ret, fd;
 
@@ -921,14 +952,23 @@  static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		return -EINVAL;
 
 	fd = READ_ONCE(sqe->fd);
-	file = fget(fd);
+	flags = READ_ONCE(sqe->flags);
+
+	if (flags & IOSQE_FIXED_FILE) {
+		if (unlikely(!ctx->user_files || fd >= ctx->user_files->count))
+			return -EBADF;
+		file = ctx->user_files->fp[fd];
+	} else {
+		file = fget(fd);
+	}
 	if (unlikely(!file))
 		return -EBADF;
 
 	ret = vfs_fsync_range(file, sqe_off, end > 0 ? end : LLONG_MAX,
 				fsync_flags & IORING_FSYNC_DATASYNC);
 
-	fput(file);
+	if (!(flags & IOSQE_FIXED_FILE))
+		fput(file);
 	io_cqring_add_event(ctx, sqe->user_data, ret, 0);
 	io_free_req(req);
 	return 0;
@@ -1065,7 +1105,7 @@  static int io_submit_sqe(struct io_ring_ctx *ctx, const struct sqe_submit *s,
 	ssize_t ret;
 
 	/* enforce forwards compatibility on users */
-	if (unlikely(s->sqe->flags))
+	if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE))
 		return -EINVAL;
 
 	req = io_get_req(ctx, state);
@@ -1253,6 +1293,104 @@  static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	return ring->r.head == ring->r.tail ? ret : 0;
 }
 
+static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
+{
+#if defined(CONFIG_NET)
+	if (ctx->ring_sock) {
+		struct sock *sock = ctx->ring_sock->sk;
+		struct sk_buff *skb;
+
+		while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
+			kfree_skb(skb);
+	}
+#else
+	int i;
+
+	for (i = 0; i < ctx->user_files->count; i++)
+		fput(ctx->user_files->fp[i]);
+
+	kfree(ctx->user_files);
+#endif
+}
+
+static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
+{
+	if (!ctx->user_files)
+		return -ENXIO;
+
+	__io_sqe_files_unregister(ctx);
+	ctx->user_files = NULL;
+	return 0;
+}
+
+static int io_sqe_files_scm(struct io_ring_ctx *ctx)
+{
+#if defined(CONFIG_NET)
+	struct scm_fp_list *fpl = ctx->user_files;
+	struct sk_buff *skb;
+	int i;
+
+	skb =  __alloc_skb(0, GFP_KERNEL, 0, NUMA_NO_NODE);
+	if (!skb)
+		return -ENOMEM;
+
+	skb->sk = ctx->ring_sock->sk;
+	skb->destructor = unix_destruct_scm;
+
+	fpl->user = get_uid(ctx->user);
+	for (i = 0; i < fpl->count; i++) {
+		get_file(fpl->fp[i]);
+		unix_inflight(fpl->user, fpl->fp[i]);
+		fput(fpl->fp[i]);
+	}
+
+	UNIXCB(skb).fp = fpl;
+	skb_queue_head(&ctx->ring_sock->sk->sk_receive_queue, skb);
+#endif
+	return 0;
+}
+
+static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
+				 unsigned nr_args)
+{
+	__s32 __user *fds = (__s32 __user *) arg;
+	struct scm_fp_list *fpl;
+	int fd, ret = 0;
+	unsigned i;
+
+	if (ctx->user_files)
+		return -EBUSY;
+	if (!nr_args || nr_args > SCM_MAX_FD)
+		return -EINVAL;
+
+	fpl = kzalloc(sizeof(*ctx->user_files), GFP_KERNEL);
+	if (!fpl)
+		return -ENOMEM;
+	fpl->max = nr_args;
+
+	for (i = 0; i < nr_args; i++) {
+		ret = -EFAULT;
+		if (copy_from_user(&fd, &fds[i], sizeof(fd)))
+			break;
+
+		fpl->fp[i] = fget(fd);
+
+		ret = -EBADF;
+		if (!fpl->fp[i])
+			break;
+		fpl->count++;
+		ret = 0;
+	}
+
+	ctx->user_files = fpl;
+	if (!ret)
+		ret = io_sqe_files_scm(ctx);
+	if (ret)
+		io_sqe_files_unregister(ctx);
+
+	return ret;
+}
+
 static int io_sq_offload_start(struct io_ring_ctx *ctx)
 {
 	int ret;
@@ -1520,14 +1658,16 @@  static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		destroy_workqueue(ctx->sqo_wq);
 	if (ctx->sqo_mm)
 		mmdrop(ctx->sqo_mm);
+
+	io_iopoll_reap_events(ctx);
+	io_sqe_buffer_unregister(ctx);
+	io_sqe_files_unregister(ctx);
+
 #if defined(CONFIG_NET)
 	if (ctx->ring_sock)
 		sock_release(ctx->ring_sock);
 #endif
 
-	io_iopoll_reap_events(ctx);
-	io_sqe_buffer_unregister(ctx);
-
 	io_mem_free(ctx->sq_ring);
 	io_mem_free(ctx->sq_sqes);
 	io_mem_free(ctx->cq_ring);
@@ -1885,6 +2025,15 @@  static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_sqe_buffer_unregister(ctx);
 		break;
+	case IORING_REGISTER_FILES:
+		ret = io_sqe_files_register(ctx, arg, nr_args);
+		break;
+	case IORING_UNREGISTER_FILES:
+		ret = -EINVAL;
+		if (arg || nr_args)
+			break;
+		ret = io_sqe_files_unregister(ctx);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/net/af_unix.h b/include/net/af_unix.h
index ddbba838d048..3426d6dacc45 100644
--- a/include/net/af_unix.h
+++ b/include/net/af_unix.h
@@ -10,6 +10,7 @@ 
 
 void unix_inflight(struct user_struct *user, struct file *fp);
 void unix_notinflight(struct user_struct *user, struct file *fp);
+void unix_destruct_scm(struct sk_buff *skb);
 void unix_gc(void);
 void wait_for_unix_gc(void);
 struct sock *unix_get_socket(struct file *filp);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index cf28f7a11f12..6257478d55e9 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -16,7 +16,7 @@ 
  */
 struct io_uring_sqe {
 	__u8	opcode;		/* type of operation for this sqe */
-	__u8	flags;		/* as of now unused */
+	__u8	flags;		/* IOSQE_ flags */
 	__u16	ioprio;		/* ioprio for the request */
 	__s32	fd;		/* file descriptor to do IO on */
 	__u64	off;		/* offset into file */
@@ -33,6 +33,11 @@  struct io_uring_sqe {
 	};
 };
 
+/*
+ * sqe->flags
+ */
+#define IOSQE_FIXED_FILE	(1U << 0)	/* use fixed fileset */
+
 /*
  * io_uring_setup() flags
  */
@@ -113,5 +118,7 @@  struct io_uring_params {
  */
 #define IORING_REGISTER_BUFFERS		0
 #define IORING_UNREGISTER_BUFFERS	1
+#define IORING_REGISTER_FILES		2
+#define IORING_UNREGISTER_FILES		3
 
 #endif
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 74d1eed7cbd4..9b1bbf74c4ea 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1497,7 +1497,7 @@  static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 		unix_notinflight(scm->fp->user, scm->fp->fp[i]);
 }
 
-static void unix_destruct_scm(struct sk_buff *skb)
+void unix_destruct_scm(struct sk_buff *skb)
 {
 	struct scm_cookie scm;
 	memset(&scm, 0, sizeof(scm));
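
A userspace sketch of how the pieces above fit together, for readers
following the series. Nothing below is part of the patch: the syscall number
is the x86-64 value io_uring_register(2) was eventually assigned, ring setup
via io_uring_setup(2) and obtaining an sqe from the mmap'ed SQ ring are
assumed to happen elsewhere, and the helper names are made up:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <linux/io_uring.h>

#ifndef __NR_io_uring_register
#define __NR_io_uring_register 427	/* assumed: x86-64 number as merged */
#endif

static int sys_io_uring_register(int ring_fd, unsigned opcode,
				 void *arg, unsigned nr_args)
{
	return syscall(__NR_io_uring_register, ring_fd, opcode, arg, nr_args);
}

/*
 * Pin a set of files for the lifetime of the ring, or until
 * IORING_UNREGISTER_FILES is called. fds is an array of __s32, per the
 * commit message above.
 */
static int register_files(int ring_fd, __s32 *fds, unsigned nr)
{
	return sys_io_uring_register(ring_fd, IORING_REGISTER_FILES, fds, nr);
}

/*
 * With a set registered, sqe->fd carries the index into that set rather
 * than a real descriptor, and IOSQE_FIXED_FILE must be set in sqe->flags.
 */
static void prep_fixed_readv(struct io_uring_sqe *sqe, int file_index,
			     const struct iovec *iov, unsigned nr_iov,
			     __u64 offset)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = IORING_OP_READV;
	sqe->flags = IOSQE_FIXED_FILE;
	sqe->fd = file_index;
	sqe->off = offset;
	sqe->addr = (unsigned long) iov;
	sqe->len = nr_iov;
}

The unregister side is symmetric: sys_io_uring_register(ring_fd,
IORING_UNREGISTER_FILES, NULL, 0), matching the arg/nr_args checks in
__io_uring_register() above.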