diff mbox series

[07/22] aio: add io_setup2() system call

Message ID 20181221192236.12866-8-axboe@kernel.dk (mailing list archive)
State New, archived
Headers show
Series [01/22] fs: add an iopoll method to struct file_operations | expand

Commit Message

Jens Axboe Dec. 21, 2018, 7:22 p.m. UTC
This is just like io_setup(), except add a flags argument to let the
caller control/define some of the io_context behavior.

Outside of the flags, we add two user pointers for future use.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 Documentation/sysctl/fs.txt            |  8 +--
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 fs/aio.c                               | 76 +++++++++++++++++---------
 include/linux/syscalls.h               |  2 +
 include/uapi/asm-generic/unistd.h      |  4 +-
 kernel/sys_ni.c                        |  1 +
 6 files changed, 62 insertions(+), 30 deletions(-)

Comments

Christoph Hellwig Dec. 27, 2018, 1:55 p.m. UTC | #1
On Fri, Dec 21, 2018 at 12:22:21PM -0700, Jens Axboe wrote:
> This is just like io_setup(), except add a flags argument to let the
> caller control/define some of the io_context behavior.
> 
> Outside of the flags, we add two user pointers for future use.
> 
> Reviewed-by: Christoph Hellwig <hch@lst.de>

Hmm, I don't think think I was fine with the current version..

> +/* sys_io_setup2:
> + *	Like sys_io_setup(), except that it takes a set of flags
> + *	(IOCTX_FLAG_*), and some pointers to user structures:
> + *
> + *	*user1 - reserved for future use
> + *
> + *	*user2 - reserved for future use.
> + */
> +SYSCALL_DEFINE5(io_setup2, u32, nr_events, u32, flags, void __user *, user1,
> +		void __user *, user2, aio_context_t __user *, ctxp)

Most importantly I'm really worried about these new user1 and user2
fields, which aren't really used for aio poll.  They are used for
the I/O ring, which while I think is a really great concept, I am also
worried about overlaying into the aio interface just creating confusion..
Jens Axboe Dec. 27, 2018, 8:27 p.m. UTC | #2
On 12/27/18 6:55 AM, Christoph Hellwig wrote:
> On Fri, Dec 21, 2018 at 12:22:21PM -0700, Jens Axboe wrote:
>> This is just like io_setup(), except add a flags argument to let the
>> caller control/define some of the io_context behavior.
>>
>> Outside of the flags, we add two user pointers for future use.
>>
>> Reviewed-by: Christoph Hellwig <hch@lst.de>
> 
> Hmm, I don't think think I was fine with the current version..

https://marc.info/?l=linux-fsdevel&m=154539128525404&w=2

>> +/* sys_io_setup2:
>> + *	Like sys_io_setup(), except that it takes a set of flags
>> + *	(IOCTX_FLAG_*), and some pointers to user structures:
>> + *
>> + *	*user1 - reserved for future use
>> + *
>> + *	*user2 - reserved for future use.
>> + */
>> +SYSCALL_DEFINE5(io_setup2, u32, nr_events, u32, flags, void __user *, user1,
>> +		void __user *, user2, aio_context_t __user *, ctxp)
> 
> Most importantly I'm really worried about these new user1 and user2
> fields, which aren't really used for aio poll.  They are used for
> the I/O ring, which while I think is a really great concept, I am also
> worried about overlaying into the aio interface just creating confusion..

As I said before, I don't want to create something entirely new just for
the sake of it. We end up reusing basically all aio data structures and
code paths, of course with some deviations here and there.

I don't see why it would cause confusion. Existing aio users are going
to be using io_setup(2) and nothing changes on that front. If you want
polled aio without using the ring, you can do so with io_setup2(2).
The fact that the ring addition ends up being setup in the same
fashion doesn't seem problematic to me. For the polled part, 99%
of the code ends up being the same.
diff mbox series

Patch

diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
index 819caf8ca05f..5e484eb7a25f 100644
--- a/Documentation/sysctl/fs.txt
+++ b/Documentation/sysctl/fs.txt
@@ -47,10 +47,10 @@  Currently, these files are in /proc/sys/fs:
 aio-nr & aio-max-nr:
 
 aio-nr is the running total of the number of events specified on the
-io_setup system call for all currently active aio contexts.  If aio-nr
-reaches aio-max-nr then io_setup will fail with EAGAIN.  Note that
-raising aio-max-nr does not result in the pre-allocation or re-sizing
-of any kernel data structures.
+io_setup/io_setup2 system call for all currently active aio contexts.
+If aio-nr reaches aio-max-nr then io_setup will fail with EAGAIN.
+Note that raising aio-max-nr does not result in the pre-allocation or
+re-sizing of any kernel data structures.
 
 ==============================================================
 
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index f0b1709a5ffb..67c357225fb0 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -343,6 +343,7 @@ 
 332	common	statx			__x64_sys_statx
 333	common	io_pgetevents		__x64_sys_io_pgetevents
 334	common	rseq			__x64_sys_rseq
+335	common	io_setup2		__x64_sys_io_setup2
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/aio.c b/fs/aio.c
index e2882334b48f..958f432a0e5b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -101,6 +101,8 @@  struct kioctx {
 
 	unsigned long		user_id;
 
+	unsigned int		flags;
+
 	struct __percpu kioctx_cpu *cpu;
 
 	/*
@@ -687,10 +689,8 @@  static void aio_nr_sub(unsigned nr)
 	spin_unlock(&aio_nr_lock);
 }
 
-/* ioctx_alloc
- *	Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
- */
-static struct kioctx *ioctx_alloc(unsigned nr_events)
+static struct kioctx *io_setup_flags(unsigned long ctxid,
+				     unsigned int nr_events, unsigned int flags)
 {
 	struct mm_struct *mm = current->mm;
 	struct kioctx *ctx;
@@ -702,6 +702,12 @@  static struct kioctx *ioctx_alloc(unsigned nr_events)
 	 */
 	unsigned int max_reqs = nr_events;
 
+	if (unlikely(ctxid || nr_events == 0)) {
+		pr_debug("EINVAL: ctx %lu nr_events %u\n",
+		         ctxid, nr_events);
+		return ERR_PTR(-EINVAL);
+	}
+
 	/*
 	 * We keep track of the number of available ringbuffer slots, to prevent
 	 * overflow (reqs_available), and we also use percpu counters for this.
@@ -727,6 +733,7 @@  static struct kioctx *ioctx_alloc(unsigned nr_events)
 	if (!ctx)
 		return ERR_PTR(-ENOMEM);
 
+	ctx->flags = flags;
 	ctx->max_reqs = max_reqs;
 
 	spin_lock_init(&ctx->ctx_lock);
@@ -1283,6 +1290,41 @@  static long read_events(struct kioctx *ctx, long min_nr, long nr,
 	return ret;
 }
 
+/* sys_io_setup2:
+ *	Like sys_io_setup(), except that it takes a set of flags
+ *	(IOCTX_FLAG_*), and some pointers to user structures:
+ *
+ *	*user1 - reserved for future use
+ *
+ *	*user2 - reserved for future use.
+ */
+SYSCALL_DEFINE5(io_setup2, u32, nr_events, u32, flags, void __user *, user1,
+		void __user *, user2, aio_context_t __user *, ctxp)
+{
+	struct kioctx *ioctx;
+	unsigned long ctx;
+	long ret;
+
+	if (flags || user1 || user2)
+		return -EINVAL;
+
+	ret = get_user(ctx, ctxp);
+	if (unlikely(ret))
+		goto out;
+
+	ioctx = io_setup_flags(ctx, nr_events, flags);
+	ret = PTR_ERR(ioctx);
+	if (IS_ERR(ioctx))
+		goto out;
+
+	ret = put_user(ioctx->user_id, ctxp);
+	if (ret)
+		kill_ioctx(current->mm, ioctx, NULL);
+	percpu_ref_put(&ioctx->users);
+out:
+	return ret;
+}
+
 /* sys_io_setup:
  *	Create an aio_context capable of receiving at least nr_events.
  *	ctxp must not point to an aio_context that already exists, and
@@ -1298,7 +1340,7 @@  static long read_events(struct kioctx *ctx, long min_nr, long nr,
  */
 SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
 {
-	struct kioctx *ioctx = NULL;
+	struct kioctx *ioctx;
 	unsigned long ctx;
 	long ret;
 
@@ -1306,14 +1348,7 @@  SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
 	if (unlikely(ret))
 		goto out;
 
-	ret = -EINVAL;
-	if (unlikely(ctx || nr_events == 0)) {
-		pr_debug("EINVAL: ctx %lu nr_events %u\n",
-		         ctx, nr_events);
-		goto out;
-	}
-
-	ioctx = ioctx_alloc(nr_events);
+	ioctx = io_setup_flags(ctx, nr_events, 0);
 	ret = PTR_ERR(ioctx);
 	if (!IS_ERR(ioctx)) {
 		ret = put_user(ioctx->user_id, ctxp);
@@ -1329,7 +1364,7 @@  SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 __user *, ctx32p)
 {
-	struct kioctx *ioctx = NULL;
+	struct kioctx *ioctx;
 	unsigned long ctx;
 	long ret;
 
@@ -1337,23 +1372,14 @@  COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 __user *, ctx32p)
 	if (unlikely(ret))
 		goto out;
 
-	ret = -EINVAL;
-	if (unlikely(ctx || nr_events == 0)) {
-		pr_debug("EINVAL: ctx %lu nr_events %u\n",
-		         ctx, nr_events);
-		goto out;
-	}
-
-	ioctx = ioctx_alloc(nr_events);
+	ioctx = io_setup_flags(ctx, nr_events, 0);
 	ret = PTR_ERR(ioctx);
 	if (!IS_ERR(ioctx)) {
-		/* truncating is ok because it's a user address */
-		ret = put_user((u32)ioctx->user_id, ctx32p);
+		ret = put_user(ioctx->user_id, ctx32p);
 		if (ret)
 			kill_ioctx(current->mm, ioctx, NULL);
 		percpu_ref_put(&ioctx->users);
 	}
-
 out:
 	return ret;
 }
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 2ac3d13a915b..67b7f03aa9fc 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -287,6 +287,8 @@  static inline void addr_limit_user_check(void)
  */
 #ifndef CONFIG_ARCH_HAS_SYSCALL_WRAPPER
 asmlinkage long sys_io_setup(unsigned nr_reqs, aio_context_t __user *ctx);
+asmlinkage long sys_io_setup2(unsigned, unsigned, void __user *, void __user *,
+				aio_context_t __user *);
 asmlinkage long sys_io_destroy(aio_context_t ctx);
 asmlinkage long sys_io_submit(aio_context_t, long,
 			struct iocb __user * __user *);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index c7f3321fbe43..1bbaa4c59f20 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -738,9 +738,11 @@  __SYSCALL(__NR_statx,     sys_statx)
 __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents)
 #define __NR_rseq 293
 __SYSCALL(__NR_rseq, sys_rseq)
+#define __NR_io_setup2 294
+__SYSCALL(__NR_io_setup2, sys_io_setup2)
 
 #undef __NR_syscalls
-#define __NR_syscalls 294
+#define __NR_syscalls 295
 
 /*
  * 32 bit systems traditionally used different
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index df556175be50..17c8b4393669 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -37,6 +37,7 @@  asmlinkage long sys_ni_syscall(void)
  */
 
 COND_SYSCALL(io_setup);
+COND_SYSCALL(io_setup2);
 COND_SYSCALL_COMPAT(io_setup);
 COND_SYSCALL(io_destroy);
 COND_SYSCALL(io_submit);