diff mbox series

[v5,02/10] io_uring: add support for IORING_OP_MKDIRAT

Message ID 20210603051836.2614535-3-dkadashev@gmail.com (mailing list archive)
State New, archived
Headers show
Series io_uring: add mkdir, [sym]linkat and mknodat support | expand

Commit Message

Dmitry Kadashev June 3, 2021, 5:18 a.m. UTC
IORING_OP_MKDIRAT behaves like mkdirat(2) and takes the same flags
and arguments.

Signed-off-by: Dmitry Kadashev <dkadashev@gmail.com>
---
 fs/io_uring.c                 | 55 +++++++++++++++++++++++++++++++++++
 include/uapi/linux/io_uring.h |  1 +
 2 files changed, 56 insertions(+)

Comments

Pavel Begunkov June 22, 2021, 11:41 a.m. UTC | #1
On 6/3/21 6:18 AM, Dmitry Kadashev wrote:
> IORING_OP_MKDIRAT behaves like mkdirat(2) and takes the same flags
> and arguments.
> 
> Signed-off-by: Dmitry Kadashev <dkadashev@gmail.com>
> ---
>  fs/io_uring.c                 | 55 +++++++++++++++++++++++++++++++++++
>  include/uapi/linux/io_uring.h |  1 +
>  2 files changed, 56 insertions(+)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index a1ca6badff36..8ab4eb559520 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -665,6 +665,13 @@ struct io_unlink {
>  	struct filename			*filename;
>  };
>  
> +struct io_mkdir {
> +	struct file			*file;
> +	int				dfd;
> +	umode_t				mode;
> +	struct filename			*filename;
> +};
> +
>  struct io_completion {
>  	struct file			*file;
>  	struct list_head		list;
> @@ -809,6 +816,7 @@ struct io_kiocb {
>  		struct io_shutdown	shutdown;
>  		struct io_rename	rename;
>  		struct io_unlink	unlink;
> +		struct io_mkdir		mkdir;
>  		/* use only after cleaning per-op data, see io_clean_op() */
>  		struct io_completion	compl;
>  	};
> @@ -1021,6 +1029,7 @@ static const struct io_op_def io_op_defs[] = {
>  	},
>  	[IORING_OP_RENAMEAT] = {},
>  	[IORING_OP_UNLINKAT] = {},
> +	[IORING_OP_MKDIRAT] = {},
>  };
>  
>  static bool io_disarm_next(struct io_kiocb *req);
> @@ -3530,6 +3539,44 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
>  	return 0;
>  }
>  
> +static int io_mkdirat_prep(struct io_kiocb *req,
> +			    const struct io_uring_sqe *sqe)
> +{
> +	struct io_mkdir *mkd = &req->mkdir;
> +	const char __user *fname;
> +
> +	if (unlikely(req->flags & REQ_F_FIXED_FILE))
> +		return -EBADF;
> +
> +	mkd->dfd = READ_ONCE(sqe->fd);
> +	mkd->mode = READ_ONCE(sqe->len);
> +
> +	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
> +	mkd->filename = getname(fname);
> +	if (IS_ERR(mkd->filename))
> +		return PTR_ERR(mkd->filename);

We have to check unused fields, e.g. buf_index and off,
to be able to use them in the future if needed. 

if (sqe->buf_index || sqe->off)
	return -EINVAL;

Please double check what fields are not used, and
same goes for all other opcodes.

> +
> +	req->flags |= REQ_F_NEED_CLEANUP;
> +	return 0;
> +}
> +
> +static int io_mkdirat(struct io_kiocb *req, int issue_flags)
> +{
> +	struct io_mkdir *mkd = &req->mkdir;
> +	int ret;
> +
> +	if (issue_flags & IO_URING_F_NONBLOCK)
> +		return -EAGAIN;
> +
> +	ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
> +
> +	req->flags &= ~REQ_F_NEED_CLEANUP;
> +	if (ret < 0)
> +		req_set_fail_links(req);
> +	io_req_complete(req, ret);
> +	return 0;
> +}
> +
>  static int io_shutdown_prep(struct io_kiocb *req,
>  			    const struct io_uring_sqe *sqe)
>  {
> @@ -5936,6 +5983,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  		return io_renameat_prep(req, sqe);
>  	case IORING_OP_UNLINKAT:
>  		return io_unlinkat_prep(req, sqe);
> +	case IORING_OP_MKDIRAT:
> +		return io_mkdirat_prep(req, sqe);
>  	}
>  
>  	printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
> @@ -6077,6 +6126,9 @@ static void io_clean_op(struct io_kiocb *req)
>  		case IORING_OP_UNLINKAT:
>  			putname(req->unlink.filename);
>  			break;
> +		case IORING_OP_MKDIRAT:
> +			putname(req->mkdir.filename);
> +			break;
>  		}
>  		req->flags &= ~REQ_F_NEED_CLEANUP;
>  	}
> @@ -6203,6 +6255,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
>  	case IORING_OP_UNLINKAT:
>  		ret = io_unlinkat(req, issue_flags);
>  		break;
> +	case IORING_OP_MKDIRAT:
> +		ret = io_mkdirat(req, issue_flags);
> +		break;
>  	default:
>  		ret = -EINVAL;
>  		break;
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index e1ae46683301..bf9d720d371f 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -137,6 +137,7 @@ enum {
>  	IORING_OP_SHUTDOWN,
>  	IORING_OP_RENAMEAT,
>  	IORING_OP_UNLINKAT,
> +	IORING_OP_MKDIRAT,
>  
>  	/* this goes last, obviously */
>  	IORING_OP_LAST,
>
Pavel Begunkov June 22, 2021, 11:50 a.m. UTC | #2
On 6/22/21 12:41 PM, Pavel Begunkov wrote:
> On 6/3/21 6:18 AM, Dmitry Kadashev wrote:
>> IORING_OP_MKDIRAT behaves like mkdirat(2) and takes the same flags
>> and arguments.
>>
>> Signed-off-by: Dmitry Kadashev <dkadashev@gmail.com>
>> ---
>>  fs/io_uring.c                 | 55 +++++++++++++++++++++++++++++++++++
>>  include/uapi/linux/io_uring.h |  1 +
>>  2 files changed, 56 insertions(+)
>>
>> diff --git a/fs/io_uring.c b/fs/io_uring.c
>> index a1ca6badff36..8ab4eb559520 100644
>> --- a/fs/io_uring.c
>> +++ b/fs/io_uring.c
>> @@ -665,6 +665,13 @@ struct io_unlink {
>>  	struct filename			*filename;
>>  };
>>  
>> +struct io_mkdir {
>> +	struct file			*file;
>> +	int				dfd;
>> +	umode_t				mode;
>> +	struct filename			*filename;
>> +};
>> +
>>  struct io_completion {
>>  	struct file			*file;
>>  	struct list_head		list;
>> @@ -809,6 +816,7 @@ struct io_kiocb {
>>  		struct io_shutdown	shutdown;
>>  		struct io_rename	rename;
>>  		struct io_unlink	unlink;
>> +		struct io_mkdir		mkdir;
>>  		/* use only after cleaning per-op data, see io_clean_op() */
>>  		struct io_completion	compl;
>>  	};
>> @@ -1021,6 +1029,7 @@ static const struct io_op_def io_op_defs[] = {
>>  	},
>>  	[IORING_OP_RENAMEAT] = {},
>>  	[IORING_OP_UNLINKAT] = {},
>> +	[IORING_OP_MKDIRAT] = {},
>>  };
>>  
>>  static bool io_disarm_next(struct io_kiocb *req);
>> @@ -3530,6 +3539,44 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
>>  	return 0;
>>  }
>>  
>> +static int io_mkdirat_prep(struct io_kiocb *req,
>> +			    const struct io_uring_sqe *sqe)
>> +{
>> +	struct io_mkdir *mkd = &req->mkdir;
>> +	const char __user *fname;
>> +
>> +	if (unlikely(req->flags & REQ_F_FIXED_FILE))
>> +		return -EBADF;
>> +
>> +	mkd->dfd = READ_ONCE(sqe->fd);
>> +	mkd->mode = READ_ONCE(sqe->len);
>> +
>> +	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
>> +	mkd->filename = getname(fname);
>> +	if (IS_ERR(mkd->filename))
>> +		return PTR_ERR(mkd->filename);
> 
> We have to check unused fields, e.g. buf_index and off,
> to be able to use them in the future if needed. 
> 
> if (sqe->buf_index || sqe->off)
> 	return -EINVAL;
> 
> Please double check what fields are not used, and
> same goes for all other opcodes.

+ opcode specific flags, e.g.

if (sqe->rw_flags)
	return -EINVAL;
Pavel Begunkov June 22, 2021, 5:41 p.m. UTC | #3
On 6/3/21 6:18 AM, Dmitry Kadashev wrote:
> IORING_OP_MKDIRAT behaves like mkdirat(2) and takes the same flags
> and arguments.

Jens, a fold-in er discussed, and it will get you
a conflict at 8/10


diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4b215e0f8dd8..c0e469ebd22d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3589,7 +3589,7 @@ static int io_mkdirat(struct io_kiocb *req, int issue_flags)
 
 	req->flags &= ~REQ_F_NEED_CLEANUP;
 	if (ret < 0)
-		req_set_fail_links(req);
+		req_set_fail(req);
 	io_req_complete(req, ret);
 	return 0;
 }


> Signed-off-by: Dmitry Kadashev <dkadashev@gmail.com>
> ---
>  fs/io_uring.c                 | 55 +++++++++++++++++++++++++++++++++++
>  include/uapi/linux/io_uring.h |  1 +
>  2 files changed, 56 insertions(+)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index a1ca6badff36..8ab4eb559520 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -665,6 +665,13 @@ struct io_unlink {
>  	struct filename			*filename;
>  };
>  
> +struct io_mkdir {
> +	struct file			*file;
> +	int				dfd;
> +	umode_t				mode;
> +	struct filename			*filename;
> +};
> +
>  struct io_completion {
>  	struct file			*file;
>  	struct list_head		list;
> @@ -809,6 +816,7 @@ struct io_kiocb {
>  		struct io_shutdown	shutdown;
>  		struct io_rename	rename;
>  		struct io_unlink	unlink;
> +		struct io_mkdir		mkdir;
>  		/* use only after cleaning per-op data, see io_clean_op() */
>  		struct io_completion	compl;
>  	};
> @@ -1021,6 +1029,7 @@ static const struct io_op_def io_op_defs[] = {
>  	},
>  	[IORING_OP_RENAMEAT] = {},
>  	[IORING_OP_UNLINKAT] = {},
> +	[IORING_OP_MKDIRAT] = {},
>  };
>  
>  static bool io_disarm_next(struct io_kiocb *req);
> @@ -3530,6 +3539,44 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
>  	return 0;
>  }
>  
> +static int io_mkdirat_prep(struct io_kiocb *req,
> +			    const struct io_uring_sqe *sqe)
> +{
> +	struct io_mkdir *mkd = &req->mkdir;
> +	const char __user *fname;
> +
> +	if (unlikely(req->flags & REQ_F_FIXED_FILE))
> +		return -EBADF;
> +
> +	mkd->dfd = READ_ONCE(sqe->fd);
> +	mkd->mode = READ_ONCE(sqe->len);
> +
> +	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
> +	mkd->filename = getname(fname);
> +	if (IS_ERR(mkd->filename))
> +		return PTR_ERR(mkd->filename);
> +
> +	req->flags |= REQ_F_NEED_CLEANUP;
> +	return 0;
> +}
> +
> +static int io_mkdirat(struct io_kiocb *req, int issue_flags)
> +{
> +	struct io_mkdir *mkd = &req->mkdir;
> +	int ret;
> +
> +	if (issue_flags & IO_URING_F_NONBLOCK)
> +		return -EAGAIN;
> +
> +	ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
> +
> +	req->flags &= ~REQ_F_NEED_CLEANUP;
> +	if (ret < 0)
> +		req_set_fail_links(req);
> +	io_req_complete(req, ret);
> +	return 0;
> +}
> +
>  static int io_shutdown_prep(struct io_kiocb *req,
>  			    const struct io_uring_sqe *sqe)
>  {
> @@ -5936,6 +5983,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
>  		return io_renameat_prep(req, sqe);
>  	case IORING_OP_UNLINKAT:
>  		return io_unlinkat_prep(req, sqe);
> +	case IORING_OP_MKDIRAT:
> +		return io_mkdirat_prep(req, sqe);
>  	}
>  
>  	printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
> @@ -6077,6 +6126,9 @@ static void io_clean_op(struct io_kiocb *req)
>  		case IORING_OP_UNLINKAT:
>  			putname(req->unlink.filename);
>  			break;
> +		case IORING_OP_MKDIRAT:
> +			putname(req->mkdir.filename);
> +			break;
>  		}
>  		req->flags &= ~REQ_F_NEED_CLEANUP;
>  	}
> @@ -6203,6 +6255,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
>  	case IORING_OP_UNLINKAT:
>  		ret = io_unlinkat(req, issue_flags);
>  		break;
> +	case IORING_OP_MKDIRAT:
> +		ret = io_mkdirat(req, issue_flags);
> +		break;
>  	default:
>  		ret = -EINVAL;
>  		break;
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index e1ae46683301..bf9d720d371f 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -137,6 +137,7 @@ enum {
>  	IORING_OP_SHUTDOWN,
>  	IORING_OP_RENAMEAT,
>  	IORING_OP_UNLINKAT,
> +	IORING_OP_MKDIRAT,
>  
>  	/* this goes last, obviously */
>  	IORING_OP_LAST,
>
Jens Axboe June 23, 2021, 12:41 a.m. UTC | #4
On 6/22/21 11:41 AM, Pavel Begunkov wrote:
> On 6/3/21 6:18 AM, Dmitry Kadashev wrote:
>> IORING_OP_MKDIRAT behaves like mkdirat(2) and takes the same flags
>> and arguments.
> 
> Jens, a fold-in er discussed, and it will get you
> a conflict at 8/10

Folded in, thanks.
Dmitry Kadashev June 23, 2021, 5:50 a.m. UTC | #5
On Wed, Jun 23, 2021 at 12:41 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> On 6/3/21 6:18 AM, Dmitry Kadashev wrote:
> > IORING_OP_MKDIRAT behaves like mkdirat(2) and takes the same flags
> > and arguments.
>
> Jens, a fold-in er discussed, and it will get you
> a conflict at 8/10

Thanks, Pavel
Dmitry Kadashev June 23, 2021, 6:41 a.m. UTC | #6
On Tue, Jun 22, 2021 at 6:50 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> On 6/22/21 12:41 PM, Pavel Begunkov wrote:
> > On 6/3/21 6:18 AM, Dmitry Kadashev wrote:
> >> IORING_OP_MKDIRAT behaves like mkdirat(2) and takes the same flags
> >> and arguments.
> >>
> >> Signed-off-by: Dmitry Kadashev <dkadashev@gmail.com>
> >> ---
> >>  fs/io_uring.c                 | 55 +++++++++++++++++++++++++++++++++++
> >>  include/uapi/linux/io_uring.h |  1 +
> >>  2 files changed, 56 insertions(+)
> >>
> >> diff --git a/fs/io_uring.c b/fs/io_uring.c
> >> index a1ca6badff36..8ab4eb559520 100644
> >> --- a/fs/io_uring.c
> >> +++ b/fs/io_uring.c
> >> @@ -665,6 +665,13 @@ struct io_unlink {
> >>      struct filename                 *filename;
> >>  };
> >>
> >> +struct io_mkdir {
> >> +    struct file                     *file;
> >> +    int                             dfd;
> >> +    umode_t                         mode;
> >> +    struct filename                 *filename;
> >> +};
> >> +
> >>  struct io_completion {
> >>      struct file                     *file;
> >>      struct list_head                list;
> >> @@ -809,6 +816,7 @@ struct io_kiocb {
> >>              struct io_shutdown      shutdown;
> >>              struct io_rename        rename;
> >>              struct io_unlink        unlink;
> >> +            struct io_mkdir         mkdir;
> >>              /* use only after cleaning per-op data, see io_clean_op() */
> >>              struct io_completion    compl;
> >>      };
> >> @@ -1021,6 +1029,7 @@ static const struct io_op_def io_op_defs[] = {
> >>      },
> >>      [IORING_OP_RENAMEAT] = {},
> >>      [IORING_OP_UNLINKAT] = {},
> >> +    [IORING_OP_MKDIRAT] = {},
> >>  };
> >>
> >>  static bool io_disarm_next(struct io_kiocb *req);
> >> @@ -3530,6 +3539,44 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
> >>      return 0;
> >>  }
> >>
> >> +static int io_mkdirat_prep(struct io_kiocb *req,
> >> +                        const struct io_uring_sqe *sqe)
> >> +{
> >> +    struct io_mkdir *mkd = &req->mkdir;
> >> +    const char __user *fname;
> >> +
> >> +    if (unlikely(req->flags & REQ_F_FIXED_FILE))
> >> +            return -EBADF;
> >> +
> >> +    mkd->dfd = READ_ONCE(sqe->fd);
> >> +    mkd->mode = READ_ONCE(sqe->len);
> >> +
> >> +    fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
> >> +    mkd->filename = getname(fname);
> >> +    if (IS_ERR(mkd->filename))
> >> +            return PTR_ERR(mkd->filename);
> >
> > We have to check unused fields, e.g. buf_index and off,
> > to be able to use them in the future if needed.
> >
> > if (sqe->buf_index || sqe->off)
> >       return -EINVAL;
> >
> > Please double check what fields are not used, and
> > same goes for all other opcodes.

This changeset is based on some other ops that were added a while ago
(renameat, unlinkat), which lack the check as well. I guess I'll just go over
all of them and add the checks in a single patch if that's OK.

I'd imagine READ_ONCE is to be used in those checks though, isn't it? Some of
the existing checks like this lack it too btw. I suppose I can fix those in a
separate commit if that makes sense.

>
> + opcode specific flags, e.g.
>
> if (sqe->rw_flags)
>         return -EINVAL;
Pavel Begunkov June 23, 2021, 11:53 a.m. UTC | #7
On 6/23/21 7:41 AM, Dmitry Kadashev wrote:
> On Tue, Jun 22, 2021 at 6:50 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>
>> On 6/22/21 12:41 PM, Pavel Begunkov wrote:
>>> On 6/3/21 6:18 AM, Dmitry Kadashev wrote:
>>>> IORING_OP_MKDIRAT behaves like mkdirat(2) and takes the same flags
>>>> and arguments.
>>>>
>>>> Signed-off-by: Dmitry Kadashev <dkadashev@gmail.com>
>>>> ---

[...]

>>> We have to check unused fields, e.g. buf_index and off,
>>> to be able to use them in the future if needed.
>>>
>>> if (sqe->buf_index || sqe->off)
>>>       return -EINVAL;
>>>
>>> Please double check what fields are not used, and
>>> same goes for all other opcodes.
> 
> This changeset is based on some other ops that were added a while ago
> (renameat, unlinkat), which lack the check as well. I guess I'll just go over

That's not great... Thanks for letting know

> all of them and add the checks in a single patch if that's OK.

For newly added opcodes a single patch on top is ok, rename and
unlink should be patched separately.

> I'd imagine READ_ONCE is to be used in those checks though, isn't it? Some of
> the existing checks like this lack it too btw. I suppose I can fix those in a
> separate commit if that makes sense.

When we really use a field there should be a READ_ONCE(),
but I wouldn't care about those we check for compatibility
reasons, but that's only my opinion.

>> + opcode specific flags, e.g.
>>
>> if (sqe->rw_flags)
>>         return -EINVAL;
>
Dmitry Kadashev June 24, 2021, 11:11 a.m. UTC | #8
On Wed, Jun 23, 2021 at 6:54 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> On 6/23/21 7:41 AM, Dmitry Kadashev wrote:
> > I'd imagine READ_ONCE is to be used in those checks though, isn't it? Some of
> > the existing checks like this lack it too btw. I suppose I can fix those in a
> > separate commit if that makes sense.
>
> When we really use a field there should be a READ_ONCE(),
> but I wouldn't care about those we check for compatibility
> reasons, but that's only my opinion.

I'm not sure how the compatibility check reads are special. The code is
either correct or not. If a compatibility check has correctness problems
then it's pretty much as bad as any other part of the code having such
problems, no?

That said, I'll just go ahead and use the approach that the rest of the
code (or rather most of it) uses (no READ_ONCE). If it needs fixing then
the whole bunch can probably be fixed in one go (either a single patch
or a series).

Thanks for your help, Pavel!
Pavel Begunkov June 24, 2021, 12:21 p.m. UTC | #9
On 6/24/21 12:11 PM, Dmitry Kadashev wrote:
> On Wed, Jun 23, 2021 at 6:54 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>
>> On 6/23/21 7:41 AM, Dmitry Kadashev wrote:
>>> I'd imagine READ_ONCE is to be used in those checks though, isn't it? Some of
>>> the existing checks like this lack it too btw. I suppose I can fix those in a
>>> separate commit if that makes sense.
>>
>> When we really use a field there should be a READ_ONCE(),
>> but I wouldn't care about those we check for compatibility
>> reasons, but that's only my opinion.
> 
> I'm not sure how the compatibility check reads are special. The code is
> either correct or not. If a compatibility check has correctness problems
> then it's pretty much as bad as any other part of the code having such
> problems, no?

If it reads and verifies a values first, e.g. index into some internal
array, and then compiler plays a joke and reloads it, we might be
absolutely screwed expecting 'segfaults', kernel data leakages and all
the fun stuff.

If that's a compatibility check, whether it's loaded earlier or later,
or whatever, it's not a big deal, the userspace can in any case change
the memory at any moment it wishes, even tightly around the moment
we're reading it.


> That said, I'll just go ahead and use the approach that the rest of the
> code (or rather most of it) uses (no READ_ONCE). If it needs fixing then
> the whole bunch can probably be fixed in one go (either a single patch
> or a series).
> 
> Thanks for your help, Pavel!
>
Dmitry Kadashev June 28, 2021, 8:17 a.m. UTC | #10
On Thu, Jun 24, 2021 at 7:22 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> On 6/24/21 12:11 PM, Dmitry Kadashev wrote:
> > On Wed, Jun 23, 2021 at 6:54 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
> >>
> >> On 6/23/21 7:41 AM, Dmitry Kadashev wrote:
> >>> I'd imagine READ_ONCE is to be used in those checks though, isn't it? Some of
> >>> the existing checks like this lack it too btw. I suppose I can fix those in a
> >>> separate commit if that makes sense.
> >>
> >> When we really use a field there should be a READ_ONCE(),
> >> but I wouldn't care about those we check for compatibility
> >> reasons, but that's only my opinion.
> >
> > I'm not sure how the compatibility check reads are special. The code is
> > either correct or not. If a compatibility check has correctness problems
> > then it's pretty much as bad as any other part of the code having such
> > problems, no?
>
> If it reads and verifies a values first, e.g. index into some internal
> array, and then compiler plays a joke and reloads it, we might be
> absolutely screwed expecting 'segfaults', kernel data leakages and all
> the fun stuff.
>
> If that's a compatibility check, whether it's loaded earlier or later,
> or whatever, it's not a big deal, the userspace can in any case change
> the memory at any moment it wishes, even tightly around the moment
> we're reading it.

Sorry for the slow reply, I have to balance this with my actual job that
is not directly related to the kernel development :)

I'm no kernel concurrency expert (actually I'm not any kind of kernel
expert), but my understanding is READ_ONCE does not just mean "do not
read more than once", but rather "read exactly once" (and more than
that), and if it's not applied then the compiler is within its rights to
optimize the read out, so the compatibility check can effectively be
disabled.

I don't think it's likely to happen, but "bad things do not happen in
practice" and "it is technically correct" are two different things :)

FWIW I'm not arguing it has to be changed, I just want to understand
things better (and if it helps to spot a bug at some point then great).
So if my reasoning is wrong then please point out where. And if it's
just the simplicity / clarity of the code that is the goal here and any
negative effects are considered to be unlikely then it's OK, I can
understand that.
Pavel Begunkov July 7, 2021, 2:06 p.m. UTC | #11
On 6/28/21 9:17 AM, Dmitry Kadashev wrote:
> On Thu, Jun 24, 2021 at 7:22 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>
>> On 6/24/21 12:11 PM, Dmitry Kadashev wrote:
>>> On Wed, Jun 23, 2021 at 6:54 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>>>
>>>> On 6/23/21 7:41 AM, Dmitry Kadashev wrote:
>>>>> I'd imagine READ_ONCE is to be used in those checks though, isn't it? Some of
>>>>> the existing checks like this lack it too btw. I suppose I can fix those in a
>>>>> separate commit if that makes sense.
>>>>
>>>> When we really use a field there should be a READ_ONCE(),
>>>> but I wouldn't care about those we check for compatibility
>>>> reasons, but that's only my opinion.
>>>
>>> I'm not sure how the compatibility check reads are special. The code is
>>> either correct or not. If a compatibility check has correctness problems
>>> then it's pretty much as bad as any other part of the code having such
>>> problems, no?
>>
>> If it reads and verifies a values first, e.g. index into some internal
>> array, and then compiler plays a joke and reloads it, we might be
>> absolutely screwed expecting 'segfaults', kernel data leakages and all
>> the fun stuff.
>>
>> If that's a compatibility check, whether it's loaded earlier or later,
>> or whatever, it's not a big deal, the userspace can in any case change
>> the memory at any moment it wishes, even tightly around the moment
>> we're reading it.
> 
> Sorry for the slow reply, I have to balance this with my actual job that
> is not directly related to the kernel development :)
> 
> I'm no kernel concurrency expert (actually I'm not any kind of kernel
> expert), but my understanding is READ_ONCE does not just mean "do not
> read more than once", but rather "read exactly once" (and more than
> that), and if it's not applied then the compiler is within its rights to
> optimize the read out, so the compatibility check can effectively be
> disabled.

Yep, as they say it's about all the "inventive" transformations
compilers can do, double read is just one of those that may turn very
nasty for us.

One big difference for me is whether it have a potential to crash the
kernel or not, though it's just one side.

Compilers can't drop the check just because, it first should be proven
to be safe to do, and there are all sorts barriers around and
limitations on how CQEs and SQEs are used, making impossible to alias
memory. E.g. CQEs and SQEs can't be reused in a single syscall, they're
only written and read respectively, and so on. Maybe, the only one I'd
worry about is the call to io_commit_sqring(), i.e. for SQE reads not
happening after it, but we need to take a look whether it's
theoretically possible.

> I don't think it's likely to happen, but "bad things do not happen in
> practice" and "it is technically correct" are two different things :)
> 
> FWIW I'm not arguing it has to be changed, I just want to understand
> things better (and if it helps to spot a bug at some point then great).
> So if my reasoning is wrong then please point out where. And if it's
> just the simplicity / clarity of the code that is the goal here and any
> negative effects are considered to be unlikely then it's OK, I can
> understand that.
Dmitry Kadashev July 12, 2021, 12:44 p.m. UTC | #12
On Wed, Jul 7, 2021 at 9:06 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> On 6/28/21 9:17 AM, Dmitry Kadashev wrote:
> > On Thu, Jun 24, 2021 at 7:22 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
> >>
> >> On 6/24/21 12:11 PM, Dmitry Kadashev wrote:
> >>> On Wed, Jun 23, 2021 at 6:54 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
> >>>>
> >>>> On 6/23/21 7:41 AM, Dmitry Kadashev wrote:
> >>>>> I'd imagine READ_ONCE is to be used in those checks though, isn't it? Some of
> >>>>> the existing checks like this lack it too btw. I suppose I can fix those in a
> >>>>> separate commit if that makes sense.
> >>>>
> >>>> When we really use a field there should be a READ_ONCE(),
> >>>> but I wouldn't care about those we check for compatibility
> >>>> reasons, but that's only my opinion.
> >>>
> >>> I'm not sure how the compatibility check reads are special. The code is
> >>> either correct or not. If a compatibility check has correctness problems
> >>> then it's pretty much as bad as any other part of the code having such
> >>> problems, no?
> >>
> >> If it reads and verifies a values first, e.g. index into some internal
> >> array, and then compiler plays a joke and reloads it, we might be
> >> absolutely screwed expecting 'segfaults', kernel data leakages and all
> >> the fun stuff.
> >>
> >> If that's a compatibility check, whether it's loaded earlier or later,
> >> or whatever, it's not a big deal, the userspace can in any case change
> >> the memory at any moment it wishes, even tightly around the moment
> >> we're reading it.
> >
> > Sorry for the slow reply, I have to balance this with my actual job that
> > is not directly related to the kernel development :)
> >
> > I'm no kernel concurrency expert (actually I'm not any kind of kernel
> > expert), but my understanding is READ_ONCE does not just mean "do not
> > read more than once", but rather "read exactly once" (and more than
> > that), and if it's not applied then the compiler is within its rights to
> > optimize the read out, so the compatibility check can effectively be
> > disabled.
>
> Yep, as they say it's about all the "inventive" transformations
> compilers can do, double read is just one of those that may turn very
> nasty for us.
>
> One big difference for me is whether it have a potential to crash the
> kernel or not, though it's just one side.

Ah, that makes sense.

> Compilers can't drop the check just because, it first should be proven
> to be safe to do, and there are all sorts barriers around and
> limitations on how CQEs and SQEs are used, making impossible to alias
> memory. E.g. CQEs and SQEs can't be reused in a single syscall, they're
> only written and read respectively, and so on. Maybe, the only one I'd
> worry about is the call to io_commit_sqring(), i.e. for SQE reads not
> happening after it, but we need to take a look whether it's
> theoretically possible.

Thanks for the explanation, Pavel!
Pavel Begunkov July 12, 2021, 1:14 p.m. UTC | #13
On 7/12/21 1:44 PM, Dmitry Kadashev wrote:
> On Wed, Jul 7, 2021 at 9:06 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>> On 6/28/21 9:17 AM, Dmitry Kadashev wrote:
>>> On Thu, Jun 24, 2021 at 7:22 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>>> On 6/24/21 12:11 PM, Dmitry Kadashev wrote:
>>>>> On Wed, Jun 23, 2021 at 6:54 PM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>>>>> On 6/23/21 7:41 AM, Dmitry Kadashev wrote:
>>>>>>> I'd imagine READ_ONCE is to be used in those checks though, isn't it? Some of
>>>>>>> the existing checks like this lack it too btw. I suppose I can fix those in a
>>>>>>> separate commit if that makes sense.
>>>>>>
>>>>>> When we really use a field there should be a READ_ONCE(),
>>>>>> but I wouldn't care about those we check for compatibility
>>>>>> reasons, but that's only my opinion.
>>>>>
>>>>> I'm not sure how the compatibility check reads are special. The code is
>>>>> either correct or not. If a compatibility check has correctness problems
>>>>> then it's pretty much as bad as any other part of the code having such
>>>>> problems, no?
>>>>
>>>> If it reads and verifies a values first, e.g. index into some internal
>>>> array, and then compiler plays a joke and reloads it, we might be
>>>> absolutely screwed expecting 'segfaults', kernel data leakages and all
>>>> the fun stuff.
>>>>
>>>> If that's a compatibility check, whether it's loaded earlier or later,
>>>> or whatever, it's not a big deal, the userspace can in any case change
>>>> the memory at any moment it wishes, even tightly around the moment
>>>> we're reading it.
>>>
>>> Sorry for the slow reply, I have to balance this with my actual job that
>>> is not directly related to the kernel development :)
>>>
>>> I'm no kernel concurrency expert (actually I'm not any kind of kernel
>>> expert), but my understanding is READ_ONCE does not just mean "do not
>>> read more than once", but rather "read exactly once" (and more than
>>> that), and if it's not applied then the compiler is within its rights to
>>> optimize the read out, so the compatibility check can effectively be
>>> disabled.
>>
>> Yep, as they say it's about all the "inventive" transformations
>> compilers can do, double read is just one of those that may turn very
>> nasty for us.
>>
>> One big difference for me is whether it have a potential to crash the
>> kernel or not, though it's just one side.
> 
> Ah, that makes sense.
> 
>> Compilers can't drop the check just because, it first should be proven
>> to be safe to do, and there are all sorts barriers around and
>> limitations on how CQEs and SQEs are used, making impossible to alias
>> memory. E.g. CQEs and SQEs can't be reused in a single syscall, they're
>> only written and read respectively, and so on. Maybe, the only one I'd
>> worry about is the call to io_commit_sqring(), i.e. for SQE reads not
>> happening after it, but we need to take a look whether it's
>> theoretically possible.
> 
> Thanks for the explanation, Pavel!

btw, that was for why we were rather reluctant about that. It could
be a good idea to add READ_ONCE as you mentioned, at least would be
less confusing.
diff mbox series

Patch

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a1ca6badff36..8ab4eb559520 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -665,6 +665,13 @@  struct io_unlink {
 	struct filename			*filename;
 };
 
+struct io_mkdir {
+	struct file			*file;
+	int				dfd;
+	umode_t				mode;
+	struct filename			*filename;
+};
+
 struct io_completion {
 	struct file			*file;
 	struct list_head		list;
@@ -809,6 +816,7 @@  struct io_kiocb {
 		struct io_shutdown	shutdown;
 		struct io_rename	rename;
 		struct io_unlink	unlink;
+		struct io_mkdir		mkdir;
 		/* use only after cleaning per-op data, see io_clean_op() */
 		struct io_completion	compl;
 	};
@@ -1021,6 +1029,7 @@  static const struct io_op_def io_op_defs[] = {
 	},
 	[IORING_OP_RENAMEAT] = {},
 	[IORING_OP_UNLINKAT] = {},
+	[IORING_OP_MKDIRAT] = {},
 };
 
 static bool io_disarm_next(struct io_kiocb *req);
@@ -3530,6 +3539,44 @@  static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags)
 	return 0;
 }
 
+static int io_mkdirat_prep(struct io_kiocb *req,
+			    const struct io_uring_sqe *sqe)
+{
+	struct io_mkdir *mkd = &req->mkdir;
+	const char __user *fname;
+
+	if (unlikely(req->flags & REQ_F_FIXED_FILE))
+		return -EBADF;
+
+	mkd->dfd = READ_ONCE(sqe->fd);
+	mkd->mode = READ_ONCE(sqe->len);
+
+	fname = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	mkd->filename = getname(fname);
+	if (IS_ERR(mkd->filename))
+		return PTR_ERR(mkd->filename);
+
+	req->flags |= REQ_F_NEED_CLEANUP;
+	return 0;
+}
+
+static int io_mkdirat(struct io_kiocb *req, int issue_flags)
+{
+	struct io_mkdir *mkd = &req->mkdir;
+	int ret;
+
+	if (issue_flags & IO_URING_F_NONBLOCK)
+		return -EAGAIN;
+
+	ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode);
+
+	req->flags &= ~REQ_F_NEED_CLEANUP;
+	if (ret < 0)
+		req_set_fail_links(req);
+	io_req_complete(req, ret);
+	return 0;
+}
+
 static int io_shutdown_prep(struct io_kiocb *req,
 			    const struct io_uring_sqe *sqe)
 {
@@ -5936,6 +5983,8 @@  static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return io_renameat_prep(req, sqe);
 	case IORING_OP_UNLINKAT:
 		return io_unlinkat_prep(req, sqe);
+	case IORING_OP_MKDIRAT:
+		return io_mkdirat_prep(req, sqe);
 	}
 
 	printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
@@ -6077,6 +6126,9 @@  static void io_clean_op(struct io_kiocb *req)
 		case IORING_OP_UNLINKAT:
 			putname(req->unlink.filename);
 			break;
+		case IORING_OP_MKDIRAT:
+			putname(req->mkdir.filename);
+			break;
 		}
 		req->flags &= ~REQ_F_NEED_CLEANUP;
 	}
@@ -6203,6 +6255,9 @@  static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 	case IORING_OP_UNLINKAT:
 		ret = io_unlinkat(req, issue_flags);
 		break;
+	case IORING_OP_MKDIRAT:
+		ret = io_mkdirat(req, issue_flags);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index e1ae46683301..bf9d720d371f 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -137,6 +137,7 @@  enum {
 	IORING_OP_SHUTDOWN,
 	IORING_OP_RENAMEAT,
 	IORING_OP_UNLINKAT,
+	IORING_OP_MKDIRAT,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,