diff mbox series

[v2,3/3] io_uring: add splice(2) support

Message ID b33d7315f266225237dfd10f483162c51c2ed5bc.1581802973.git.asml.silence@gmail.com (mailing list archive)
State New, archived
Headers show
Series io_uring: add splice(2) support | expand

Commit Message

Pavel Begunkov Feb. 15, 2020, 10:05 p.m. UTC
Add support for splice(2). Out file is handled in generic path,
input file owned cared by splice* bits only.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c                 | 106 ++++++++++++++++++++++++++++++++++
 include/uapi/linux/io_uring.h |  14 ++++-
 2 files changed, 119 insertions(+), 1 deletion(-)

Comments

Stefan Metzmacher Feb. 17, 2020, 3:18 p.m. UTC | #1
Hi Pavel,

> +static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
> +{
> +	struct io_splice* sp = &req->splice;
> +	unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
> +	int ret;
> +
> +	if (req->flags & REQ_F_NEED_CLEANUP)
> +		return 0;
> +
> +	sp->file_in = NULL;
> +	sp->off_in = READ_ONCE(sqe->off_in);
> +	sp->off_out = READ_ONCE(sqe->off);
> +	sp->len = READ_ONCE(sqe->len);
> +	sp->flags = READ_ONCE(sqe->splice_flags);
> +
> +	if (unlikely(READ_ONCE(sqe->ioprio) || (sp->flags & ~valid_flags)))
> +		return -EINVAL;

Why is ioprio not supported?

metze
Pavel Begunkov Feb. 17, 2020, 3:40 p.m. UTC | #2
On 2/17/2020 6:18 PM, Stefan Metzmacher wrote:
> Hi Pavel,
> 
>> +static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>> +{
>> +	struct io_splice* sp = &req->splice;
>> +	unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
>> +	int ret;
>> +
>> +	if (req->flags & REQ_F_NEED_CLEANUP)
>> +		return 0;
>> +
>> +	sp->file_in = NULL;
>> +	sp->off_in = READ_ONCE(sqe->off_in);
>> +	sp->off_out = READ_ONCE(sqe->off);
>> +	sp->len = READ_ONCE(sqe->len);
>> +	sp->flags = READ_ONCE(sqe->splice_flags);
>> +
>> +	if (unlikely(READ_ONCE(sqe->ioprio) || (sp->flags & ~valid_flags)))
>> +		return -EINVAL;
> 
> Why is ioprio not supported?

Because there is no way to set it without changing much of splice code.
It may be added later

BTW, it seems, only opcodes cares about ioprio are read*/write*.
recv*() and send*() don't reject it, but never use.
Stefan Metzmacher Feb. 17, 2020, 3:54 p.m. UTC | #3
Am 17.02.20 um 16:40 schrieb Pavel Begunkov:
> On 2/17/2020 6:18 PM, Stefan Metzmacher wrote:
>> Hi Pavel,
>>
>>> +static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>> +{
>>> +	struct io_splice* sp = &req->splice;
>>> +	unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
>>> +	int ret;
>>> +
>>> +	if (req->flags & REQ_F_NEED_CLEANUP)
>>> +		return 0;
>>> +
>>> +	sp->file_in = NULL;
>>> +	sp->off_in = READ_ONCE(sqe->off_in);
>>> +	sp->off_out = READ_ONCE(sqe->off);
>>> +	sp->len = READ_ONCE(sqe->len);
>>> +	sp->flags = READ_ONCE(sqe->splice_flags);
>>> +
>>> +	if (unlikely(READ_ONCE(sqe->ioprio) || (sp->flags & ~valid_flags)))
>>> +		return -EINVAL;
>>
>> Why is ioprio not supported?
> 
> Because there is no way to set it without changing much of splice code.
> It may be added later
> 
> BTW, it seems, only opcodes cares about ioprio are read*/write*.
> recv*() and send*() don't reject it, but never use.

I guess it's more like a hint, so should we just ignore it until
it's passed down? Otherwise applications need to do some logic to
find out if they can pass a value or not.

I'm not sure what's better, but I think it needs to be discussed...

metze
Pavel Begunkov Feb. 17, 2020, 3:59 p.m. UTC | #4
On 2/17/2020 6:54 PM, Stefan Metzmacher wrote:
> Am 17.02.20 um 16:40 schrieb Pavel Begunkov:
>> On 2/17/2020 6:18 PM, Stefan Metzmacher wrote:
>>> Hi Pavel,
>>>
>>>> +static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>>>> +{
>>>> +	struct io_splice* sp = &req->splice;
>>>> +	unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
>>>> +	int ret;
>>>> +
>>>> +	if (req->flags & REQ_F_NEED_CLEANUP)
>>>> +		return 0;
>>>> +
>>>> +	sp->file_in = NULL;
>>>> +	sp->off_in = READ_ONCE(sqe->off_in);
>>>> +	sp->off_out = READ_ONCE(sqe->off);
>>>> +	sp->len = READ_ONCE(sqe->len);
>>>> +	sp->flags = READ_ONCE(sqe->splice_flags);
>>>> +
>>>> +	if (unlikely(READ_ONCE(sqe->ioprio) || (sp->flags & ~valid_flags)))
>>>> +		return -EINVAL;
>>>
>>> Why is ioprio not supported?
>>
>> Because there is no way to set it without changing much of splice code.
>> It may be added later
>>
>> BTW, it seems, only opcodes cares about ioprio are read*/write*.
>> recv*() and send*() don't reject it, but never use.
> 
> I guess it's more like a hint, so should we just ignore it until
> it's passed down? Otherwise applications need to do some logic to
> find out if they can pass a value or not.

Then it probably needs to validate the value, but not just ignore it

> I'm not sure what's better, but I think it needs to be discussed...

meh, let's see what Jens think
diff mbox series

Patch

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 389db6f5568b..1c71d848c974 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -76,6 +76,7 @@ 
 #include <linux/fadvise.h>
 #include <linux/eventpoll.h>
 #include <linux/fs_struct.h>
+#include <linux/splice.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/io_uring.h>
@@ -433,6 +434,15 @@  struct io_epoll {
 	struct epoll_event		event;
 };
 
+struct io_splice {
+	struct file			*file_out;
+	struct file			*file_in;
+	loff_t				off_out;
+	loff_t				off_in;
+	u64				len;
+	unsigned int			flags;
+};
+
 struct io_async_connect {
 	struct sockaddr_storage		address;
 };
@@ -546,6 +556,7 @@  struct io_kiocb {
 		struct io_fadvise	fadvise;
 		struct io_madvise	madvise;
 		struct io_epoll		epoll;
+		struct io_splice	splice;
 	};
 
 	struct io_async_ctx		*io;
@@ -746,6 +757,11 @@  static const struct io_op_def io_op_defs[] = {
 		.unbound_nonreg_file	= 1,
 		.file_table		= 1,
 	},
+	[IORING_OP_SPLICE] = {
+		.needs_file		= 1,
+		.hash_reg_file		= 1,
+		.unbound_nonreg_file	= 1,
+	}
 };
 
 static void io_wq_submit_work(struct io_wq_work **workptr);
@@ -760,6 +776,10 @@  static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 static int io_grab_files(struct io_kiocb *req);
 static void io_ring_file_ref_flush(struct fixed_file_data *data);
 static void io_cleanup_req(struct io_kiocb *req);
+static int io_get_file(struct io_submit_state *state,
+		       struct io_ring_ctx *ctx,
+		       int fd, struct file **out_file,
+		       bool fixed);
 
 static struct kmem_cache *req_cachep;
 
@@ -2412,6 +2432,77 @@  static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
 	return ret;
 }
 
+static int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_splice* sp = &req->splice;
+	unsigned int valid_flags = SPLICE_F_FD_IN_FIXED | SPLICE_F_ALL;
+	int ret;
+
+	if (req->flags & REQ_F_NEED_CLEANUP)
+		return 0;
+
+	sp->file_in = NULL;
+	sp->off_in = READ_ONCE(sqe->off_in);
+	sp->off_out = READ_ONCE(sqe->off);
+	sp->len = READ_ONCE(sqe->len);
+	sp->flags = READ_ONCE(sqe->splice_flags);
+
+	if (unlikely(READ_ONCE(sqe->ioprio) || (sp->flags & ~valid_flags)))
+		return -EINVAL;
+
+	ret = io_get_file(NULL, req->ctx, READ_ONCE(sqe->splice_fd_in),
+			   &sp->file_in, (sp->flags & SPLICE_F_FD_IN_FIXED));
+	if (ret)
+		return ret;
+	req->flags |= REQ_F_NEED_CLEANUP;
+
+	if (!S_ISREG(file_inode(sp->file_in)->i_mode))
+		req->work.flags |= IO_WQ_WORK_UNBOUND;
+
+	return 0;
+}
+
+static bool io_splice_punt(struct file *file)
+{
+	if (get_pipe_info(file))
+		return false;
+	if (!io_file_supports_async(file))
+		return true;
+	return !(file->f_mode & O_NONBLOCK);
+}
+
+static int io_splice(struct io_kiocb *req, struct io_kiocb **nxt,
+		     bool force_nonblock)
+{
+	struct io_splice* sp = &req->splice;
+	struct file *in = sp->file_in;
+	struct file *out = sp->file_out;
+	unsigned int flags = sp->flags & ~SPLICE_F_FD_IN_FIXED;
+	loff_t *poff_in, *poff_out;
+	long ret;
+
+	if (force_nonblock) {
+		if (io_splice_punt(in) || io_splice_punt(out))
+			return -EAGAIN;
+		flags |= SPLICE_F_NONBLOCK;
+	}
+
+	poff_in = (sp->off_in == -1) ? NULL : &sp->off_in;
+	poff_out = (sp->off_out == -1) ? NULL : &sp->off_out;
+	ret = do_splice(in, poff_in, out, poff_out, sp->len, flags);
+	if (force_nonblock && ret == -EAGAIN)
+		return -EAGAIN;
+
+	io_put_file(req->ctx, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
+	req->flags &= ~REQ_F_NEED_CLEANUP;
+
+	io_cqring_add_event(req, ret);
+	if (ret != sp->len)
+		req_set_fail_links(req);
+	io_put_req_find_next(req, nxt);
+	return 0;
+}
+
 /*
  * IORING_OP_NOP just posts a completion event, nothing else.
  */
@@ -4227,6 +4318,9 @@  static int io_req_defer_prep(struct io_kiocb *req,
 	case IORING_OP_EPOLL_CTL:
 		ret = io_epoll_ctl_prep(req, sqe);
 		break;
+	case IORING_OP_SPLICE:
+		ret = io_splice_prep(req, sqe);
+		break;
 	default:
 		printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
 				req->opcode);
@@ -4289,6 +4383,10 @@  static void io_cleanup_req(struct io_kiocb *req)
 	case IORING_OP_STATX:
 		putname(req->open.filename);
 		break;
+	case IORING_OP_SPLICE:
+		io_put_file(req->ctx, req->splice.file_in,
+			    (req->splice.flags & SPLICE_F_FD_IN_FIXED));
+		break;
 	}
 
 	req->flags &= ~REQ_F_NEED_CLEANUP;
@@ -4492,6 +4590,14 @@  static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		}
 		ret = io_epoll_ctl(req, nxt, force_nonblock);
 		break;
+	case IORING_OP_SPLICE:
+		if (sqe) {
+			ret = io_splice_prep(req, sqe);
+			if (ret < 0)
+				break;
+		}
+		ret = io_splice(req, nxt, force_nonblock);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 3f7961c1c243..bc2fe0281de7 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -23,7 +23,10 @@  struct io_uring_sqe {
 		__u64	off;	/* offset into file */
 		__u64	addr2;
 	};
-	__u64	addr;		/* pointer to buffer or iovecs */
+	union {
+		__u64	addr;	/* pointer to buffer or iovecs */
+		__u64	off_in;
+	};
 	__u32	len;		/* buffer size or number of iovecs */
 	union {
 		__kernel_rwf_t	rw_flags;
@@ -37,6 +40,7 @@  struct io_uring_sqe {
 		__u32		open_flags;
 		__u32		statx_flags;
 		__u32		fadvise_advice;
+		__u32		splice_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	union {
@@ -45,6 +49,7 @@  struct io_uring_sqe {
 			__u16	buf_index;
 			/* personality to use, if used */
 			__u16	personality;
+			__u32	splice_fd_in;
 		};
 		__u64	__pad2[3];
 	};
@@ -113,6 +118,7 @@  enum {
 	IORING_OP_RECV,
 	IORING_OP_OPENAT2,
 	IORING_OP_EPOLL_CTL,
+	IORING_OP_SPLICE,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,
@@ -128,6 +134,12 @@  enum {
  */
 #define IORING_TIMEOUT_ABS	(1U << 0)
 
+/*
+ * sqe->splice_flags
+ * extends splice(2) flags
+ */
+#define SPLICE_F_FD_IN_FIXED	(1U << 31) /* the last bit of __u32 */
+
 /*
  * IO completion data structure (Completion Queue Entry)
  */