
[v10,4/5] io_uring: add fsetxattr and setxattr support

Message ID 20211229203002.4110839-5-shr@fb.com
State New, archived
Series io_uring: add xattr support

Commit Message

Stefan Roesch Dec. 29, 2021, 8:30 p.m. UTC
This adds support to io_uring for the fsetxattr and setxattr syscalls.

Signed-off-by: Stefan Roesch <shr@fb.com>
Acked-by: Christian Brauner <christian.brauner@ubuntu.com>
---
 fs/io_uring.c                 | 164 ++++++++++++++++++++++++++++++++++
 include/uapi/linux/io_uring.h |   6 +-
 2 files changed, 169 insertions(+), 1 deletion(-)
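
For context, a userspace submission of the new IORING_OP_SETXATTR opcode would
fill the SQE roughly as in the sketch below. This is a sketch only: the field
mapping (addr = xattr name, addr2 = value, addr3 = path, len = value size,
xattr_flags = flags) follows the prep helpers in the patch, while the liburing
calls around it and the helper name are assumptions, not part of this series.
It also requires the updated uapi header from this patch for the addr3 and
xattr_flags fields.

#include <liburing.h>
#include <string.h>

/* Sketch: hand-rolled SQE setup; liburing has no helper for this opcode here. */
static int submit_setxattr(struct io_uring *ring, const char *path,
			   const char *name, const void *value, size_t len)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -1;
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode	 = IORING_OP_SETXATTR;
	sqe->addr	 = (unsigned long) name;	/* xattr name  -> sqe->addr  */
	sqe->addr2	 = (unsigned long) value;	/* xattr value -> sqe->addr2 */
	sqe->addr3	 = (unsigned long) path;	/* pathname    -> sqe->addr3 */
	sqe->len	 = len;				/* value size  -> sqe->len   */
	sqe->xattr_flags = 0;				/* XATTR_CREATE / XATTR_REPLACE */

	return io_uring_submit(ring);
}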

Comments

Al Viro Dec. 30, 2021, 1:58 a.m. UTC | #1
On Wed, Dec 29, 2021 at 12:30:01PM -0800, Stefan Roesch wrote:

> +static int io_setxattr_prep(struct io_kiocb *req,
> +			const struct io_uring_sqe *sqe)
> +{
> +	struct io_xattr *ix = &req->xattr;
> +	const char __user *path;
> +	int ret;
> +
> +	ret = __io_setxattr_prep(req, sqe);
> +	if (ret)
> +		return ret;
> +
> +	path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
> +
> +	ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
> +	if (IS_ERR(ix->filename)) {
> +		ret = PTR_ERR(ix->filename);
> +		ix->filename = NULL;
> +	}
> +
> +	return ret;
> +}

Same question as for getxattr side.  Why bother doing getname in prep
and open-coding the ESTALE retry loop in io_setxattr() proper?

Again, if you have hit the retry_estale() returning true, you are already
on a very slow path; trying to save on getname is completely pointless.
Moreover, had there been a situation where it might have been warranted
(and I really can't imagine one), why would that only be applicable in
io_uring case?
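
For comparison, the plain setxattr(2) path in fs/xattr.c at the time looked
roughly like the sketch below (reconstructed from memory, so treat the details
as approximate). user_path_at() redoes the getname() on every pass through the
loop, and the retry is only taken when retry_estale() fires, i.e. on an
already-slow path:

static int path_setxattr(const char __user *pathname,
			 const char __user *name, const void __user *value,
			 size_t size, int flags, unsigned int lookup_flags)
{
	struct path path;
	int error;

retry:
	error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
	if (error)
		return error;
	error = mnt_want_write(path.mnt);
	if (!error) {
		/* copies name/value from userland and calls vfs_setxattr() */
		error = setxattr(mnt_user_ns(path.mnt), path.dentry, name,
				 value, size, flags);
		mnt_drop_write(path.mnt);
	}
	path_put(&path);
	if (retry_estale(error, lookup_flags)) {
		lookup_flags |= LOOKUP_REVAL;
		goto retry;
	}
	return error;
}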
Al Viro Dec. 30, 2021, 2:17 a.m. UTC | #2
On Wed, Dec 29, 2021 at 12:30:01PM -0800, Stefan Roesch wrote:

> +static int __io_setxattr_prep(struct io_kiocb *req,
> +			const struct io_uring_sqe *sqe)
> +{
> +	struct io_xattr *ix = &req->xattr;
> +	const char __user *name;
> +	int ret;
> +
> +	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
> +		return -EINVAL;
> +	if (unlikely(sqe->ioprio))
> +		return -EINVAL;
> +	if (unlikely(req->flags & REQ_F_FIXED_FILE))
> +		return -EBADF;
> +
> +	ix->filename = NULL;
> +	name = u64_to_user_ptr(READ_ONCE(sqe->addr));
> +	ix->ctx.value = u64_to_user_ptr(READ_ONCE(sqe->addr2));
> +	ix->ctx.kvalue = NULL;
> +	ix->ctx.size = READ_ONCE(sqe->len);
> +	ix->ctx.flags = READ_ONCE(sqe->xattr_flags);
> +
> +	ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL);
> +	if (!ix->ctx.kname)
> +		return -ENOMEM;
> +
> +	ret = setxattr_copy(name, &ix->ctx);
> +	if (ret) {
> +		kfree(ix->ctx.kname);
> +		return ret;
> +	}
> +
> +	req->flags |= REQ_F_NEED_CLEANUP;
> +	return 0;
> +}

OK, so you
	* allocate a buffer for xattr name
	* have setxattr_copy() copy the name in *and* memdup the contents
	* on failure, you have the buffer for xattr name freed and return
an error.  memdup'ed stuff is left for cleanup, presumably.

> +static int io_setxattr_prep(struct io_kiocb *req,
> +			const struct io_uring_sqe *sqe)
> +{
> +	struct io_xattr *ix = &req->xattr;
> +	const char __user *path;
> +	int ret;
> +
> +	ret = __io_setxattr_prep(req, sqe);
> +	if (ret)
> +		return ret;
> +
> +	path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
> +
> +	ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
> +	if (IS_ERR(ix->filename)) {
> +		ret = PTR_ERR(ix->filename);
> +		ix->filename = NULL;
> +	}
> +
> +	return ret;
> +}

... and here you use it and bring the pathname in.  Should the latter
step fail, you restore ->filename to NULL and return an error.

Could you explain what kind of magic could allow the caller to tell
whether ix->ctx.kname needs to be freed on error?  I don't see any way
that could possibly work...
Al Viro Dec. 30, 2021, 2:19 a.m. UTC | #3
On Thu, Dec 30, 2021 at 02:17:23AM +0000, Al Viro wrote:

> > +static int io_setxattr_prep(struct io_kiocb *req,
> > +			const struct io_uring_sqe *sqe)
> > +{
> > +	struct io_xattr *ix = &req->xattr;
> > +	const char __user *path;
> > +	int ret;
> > +
> > +	ret = __io_setxattr_prep(req, sqe);
> > +	if (ret)
> > +		return ret;
> > +
> > +	path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
> > +
> > +	ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
> > +	if (IS_ERR(ix->filename)) {
> > +		ret = PTR_ERR(ix->filename);
> > +		ix->filename = NULL;
> > +	}
> > +
> > +	return ret;
> > +}
> 
> ... and here you use it and bring the pathname in.  Should the latter
> step fail, you restore ->filename to NULL and return an error.
> 
> Could you explain what kind of magic could allow the caller to tell
> whether ix->ctx.kname needs to be freed on error?  I don't see any way
> that could possibly work...

getxattr side has the same problem, AFAICS...
Al Viro Dec. 30, 2021, 3:04 a.m. UTC | #4
On Thu, Dec 30, 2021 at 02:17:23AM +0000, Al Viro wrote:
> On Wed, Dec 29, 2021 at 12:30:01PM -0800, Stefan Roesch wrote:
> 
> > +static int __io_setxattr_prep(struct io_kiocb *req,
> > +			const struct io_uring_sqe *sqe)
> > +{
> > +	struct io_xattr *ix = &req->xattr;
> > +	const char __user *name;
> > +	int ret;
> > +
> > +	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
> > +		return -EINVAL;
> > +	if (unlikely(sqe->ioprio))
> > +		return -EINVAL;
> > +	if (unlikely(req->flags & REQ_F_FIXED_FILE))
> > +		return -EBADF;
> > +
> > +	ix->filename = NULL;
> > +	name = u64_to_user_ptr(READ_ONCE(sqe->addr));
> > +	ix->ctx.value = u64_to_user_ptr(READ_ONCE(sqe->addr2));
> > +	ix->ctx.kvalue = NULL;
> > +	ix->ctx.size = READ_ONCE(sqe->len);
> > +	ix->ctx.flags = READ_ONCE(sqe->xattr_flags);
> > +
> > +	ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL);
> > +	if (!ix->ctx.kname)
> > +		return -ENOMEM;
> > +
> > +	ret = setxattr_copy(name, &ix->ctx);
> > +	if (ret) {
> > +		kfree(ix->ctx.kname);
> > +		return ret;
> > +	}
> > +
> > +	req->flags |= REQ_F_NEED_CLEANUP;
> > +	return 0;
> > +}
> 
> OK, so you
> 	* allocate a buffer for xattr name
> 	* have setxattr_copy() copy the name in *and* memdup the contents
> 	* on failure, you have the buffer for xattr name freed and return
> an error.  memdup'ed stuff is left for cleanup, presumably.
> 
> > +static int io_setxattr_prep(struct io_kiocb *req,
> > +			const struct io_uring_sqe *sqe)
> > +{
> > +	struct io_xattr *ix = &req->xattr;
> > +	const char __user *path;
> > +	int ret;
> > +
> > +	ret = __io_setxattr_prep(req, sqe);
> > +	if (ret)
> > +		return ret;
> > +
> > +	path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
> > +
> > +	ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
> > +	if (IS_ERR(ix->filename)) {
> > +		ret = PTR_ERR(ix->filename);
> > +		ix->filename = NULL;
> > +	}
> > +
> > +	return ret;
> > +}
> 
> ... and here you use it and bring the pathname in.  Should the latter
> step fail, you restore ->filename to NULL and return an error.
> 
> Could you explain what kind of magic could allow the caller to tell
> whether ix->ctx.kname needs to be freed on error?  I don't see any way
> that could possibly work...

FWIW, your calling conventions make no sense whatsoever.  OK, you have
a helper that does copyin of the arguments.  And it needs to be shared
between the syscall path (where you put the xattr name on stack) and
io_uring one (where you allocate it dynamically).  Why not simply move
the allocation into that helper, conditional upon the passed value being
NULL?  And leave it alone on any failure paths in that helper.

Syscall user would set it pointing to local structure/string/whatnot.
No freeing is needed there in any case.

io_uring one would set it to NULL and free the value left there on
cleanup.  Again, same in all cases, error or no error.  Just make sure
you have it zeroed *before* any failure exits (including those on req->flags,
etc.)

While we are at it, syscall path needs to free the copied xattr contents
anyway.  So screw freeing anything in that helper (both allocation failures
and copyin ones); have all freeing done by caller, and make it unconditional
there.  An error is usually a slow path; an error of that sort - definitely
so.  IOW,
	1) call the helper, copying userland data into the buffers allocated
by the helper
	2) if helper hasn't returned an error, do work
	3) free whatever the helper has allocated
With (3) being unconditional.  It doesn't make any sense to have a separate
early exit, especially since with your approach you end up paying the price
on failure exits in the helper anyway.

	error = setxattr_copy(...);
	if (likely(!error))
		error = do_setxattr(...);
	kvfree(...);
	return error;

would've been better for the syscall side as well.
Christian Brauner Dec. 30, 2021, 10:12 a.m. UTC | #5
On Thu, Dec 30, 2021 at 03:04:34AM +0000, Al Viro wrote:
> On Thu, Dec 30, 2021 at 02:17:23AM +0000, Al Viro wrote:
> > On Wed, Dec 29, 2021 at 12:30:01PM -0800, Stefan Roesch wrote:
> > 
> > > +static int __io_setxattr_prep(struct io_kiocb *req,
> > > +			const struct io_uring_sqe *sqe)
> > > +{
> > > +	struct io_xattr *ix = &req->xattr;
> > > +	const char __user *name;
> > > +	int ret;
> > > +
> > > +	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
> > > +		return -EINVAL;
> > > +	if (unlikely(sqe->ioprio))
> > > +		return -EINVAL;
> > > +	if (unlikely(req->flags & REQ_F_FIXED_FILE))
> > > +		return -EBADF;
> > > +
> > > +	ix->filename = NULL;
> > > +	name = u64_to_user_ptr(READ_ONCE(sqe->addr));
> > > +	ix->ctx.value = u64_to_user_ptr(READ_ONCE(sqe->addr2));
> > > +	ix->ctx.kvalue = NULL;
> > > +	ix->ctx.size = READ_ONCE(sqe->len);
> > > +	ix->ctx.flags = READ_ONCE(sqe->xattr_flags);
> > > +
> > > +	ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL);
> > > +	if (!ix->ctx.kname)
> > > +		return -ENOMEM;
> > > +
> > > +	ret = setxattr_copy(name, &ix->ctx);
> > > +	if (ret) {
> > > +		kfree(ix->ctx.kname);
> > > +		return ret;
> > > +	}
> > > +
> > > +	req->flags |= REQ_F_NEED_CLEANUP;
> > > +	return 0;
> > > +}
> > 
> > OK, so you
> > 	* allocate a buffer for xattr name
> > 	* have setxattr_copy() copy the name in *and* memdup the contents
> > 	* on failure, you have the buffer for xattr name freed and return
> > an error.  memdup'ed stuff is left for cleanup, presumably.
> > 
> > > +static int io_setxattr_prep(struct io_kiocb *req,
> > > +			const struct io_uring_sqe *sqe)
> > > +{
> > > +	struct io_xattr *ix = &req->xattr;
> > > +	const char __user *path;
> > > +	int ret;
> > > +
> > > +	ret = __io_setxattr_prep(req, sqe);
> > > +	if (ret)
> > > +		return ret;
> > > +
> > > +	path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
> > > +
> > > +	ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
> > > +	if (IS_ERR(ix->filename)) {
> > > +		ret = PTR_ERR(ix->filename);
> > > +		ix->filename = NULL;
> > > +	}
> > > +
> > > +	return ret;
> > > +}
> > 
> > ... and here you use it and bring the pathname in.  Should the latter
> > step fail, you restore ->filename to NULL and return an error.
> > 
> > Could you explain what kind of magic could allow the caller to tell
> > whether ix->ctx.kname needs to be freed on error?  I don't see any way
> > that could possibly work...
> 
> FWIW, your calling conventions make no sense whatsoever.  OK, you have
> a helper that does copyin of the arguments.  And it needs to be shared
> between the syscall path (where you put the xattr name on stack) and
> io_uring one (where you allocate it dynamically).  Why not simply move
> the allocation into that helper, conditional upon the passed value being
> NULL?  And leave it alone on any failure paths in that helper.

I had thought about that too when looking at Stefan's code at first but
then concluded that doesn't make sense since io_uring doesn't allocate
xattr_ctx dynamically either. It embeds it directly in io_xattr, which
itself isn't allocated dynamically either.

But I think the unconditional cleanup you proposed makes sense. If we
add a simple static inline helper to internal.h to clean up xattr_ctx
once the caller is done we can use that in __io_setxattr() and
setxattr(): 

From 248cae031c21d3103c8ab46afd729aa46114019a Mon Sep 17 00:00:00 2001
From: Christian Brauner <christian.brauner@ubuntu.com>
Date: Thu, 30 Dec 2021 11:02:57 +0100
Subject: [PATCH] UNTESTED

---
 fs/internal.h |  7 +++++++
 fs/io_uring.c | 11 +----------
 fs/xattr.c    | 10 ++++------
 3 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/fs/internal.h b/fs/internal.h
index 942b2005a2be..446dca46d845 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -228,5 +228,12 @@ ssize_t do_getxattr(struct user_namespace *mnt_userns,
 		    size_t size);
 
 int setxattr_copy(const char __user *name, struct xattr_ctx *ctx);
+static inline void setxattr_finish(struct xattr_ctx *ctx)
+{
+	kfree(ctx->kname);
+	kvfree(ctx->kvalue);
+	memset(ctx, 0, sizeof(struct xattr_ctx));
+}
+
 int do_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry,
 		struct xattr_ctx *ctx);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7204b8d593e4..2e30c7a87eb9 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4114,7 +4114,7 @@ static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags,
 		ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx);
 		mnt_drop_write(path->mnt);
 	}
-
+	setxattr_finish(&ix->ctx);
 	return ret;
 }
 
@@ -4127,12 +4127,7 @@ static int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags)
 		return -EAGAIN;
 
 	ret = __io_setxattr(req, issue_flags, &req->file->f_path);
-
 	req->flags &= ~REQ_F_NEED_CLEANUP;
-	kfree(ix->ctx.kname);
-
-	if (ix->ctx.kvalue)
-		kvfree(ix->ctx.kvalue);
 	if (ret < 0)
 		req_set_fail(req);
 
@@ -4163,10 +4158,6 @@ static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags)
 	putname(ix->filename);
 
 	req->flags &= ~REQ_F_NEED_CLEANUP;
-	kfree(ix->ctx.kname);
-
-	if (ix->ctx.kvalue)
-		kvfree(ix->ctx.kvalue);
 	if (ret < 0)
 		req_set_fail(req);
 
diff --git a/fs/xattr.c b/fs/xattr.c
index 3b6d683d07b9..0f4e067816bc 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -545,6 +545,7 @@ EXPORT_SYMBOL_GPL(vfs_removexattr);
 int setxattr_copy(const char __user *name, struct xattr_ctx *ctx)
 {
 	int error;
+	struct xattr_ctx *new_ctx;
 
 	if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE))
 		return -EINVAL;
@@ -606,12 +607,9 @@ setxattr(struct user_namespace *mnt_userns, struct dentry *d,
 	int error;
 
 	error = setxattr_copy(name, &ctx);
-	if (error)
-		return error;
-
-	error = do_setxattr(mnt_userns, d, &ctx);
-
-	kvfree(ctx.kvalue);
+	if (!error)
+		error = do_setxattr(mnt_userns, d, &ctx);
+	setxattr_finish(&ctx);
 	return error;
 }
Al Viro Dec. 30, 2021, 4:16 p.m. UTC | #6
On Thu, Dec 30, 2021 at 11:12:42AM +0100, Christian Brauner wrote:

> @@ -545,6 +545,7 @@ EXPORT_SYMBOL_GPL(vfs_removexattr);
>  int setxattr_copy(const char __user *name, struct xattr_ctx *ctx)
>  {
>  	int error;
> +	struct xattr_ctx *new_ctx;
>  
>  	if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE))
>  		return -EINVAL;
> @@ -606,12 +607,9 @@ setxattr(struct user_namespace *mnt_userns, struct dentry *d,
>  	int error;
>  
>  	error = setxattr_copy(name, &ctx);
> -	if (error)
> -		return error;
> -
> -	error = do_setxattr(mnt_userns, d, &ctx);
> -
> -	kvfree(ctx.kvalue);
> +	if (!error)
> +		error = do_setxattr(mnt_userns, d, &ctx);
> +	setxattr_finish(&ctx);
>  	return error;
>  }

Huh?  Have you lost a chunk or two in there?  The only modification of
setxattr_copy() in your delta is the introduction of an unused local
variable.  Confused...

What I had in mind is something like this:

// same for getxattr and setxattr
static int xattr_name_from_user(const char __user *name, struct xattr_ctx *ctx)
{
	int copied;

	if (!ctx->xattr_name) {
		ctx->xattr_name = kmalloc(XATTR_NAME_MAX + 1, GFP_KERNEL);
		if (!ctx->xattr_name)
			return -ENOMEM;
	}

	copied = strncpy_from_user(ctx->xattr_name, name, XATTR_NAME_MAX + 1);
 	if (copied < 0)
 		return copied;	// copyin failure; almost always -EFAULT
	if (copied == 0 || copied == XATTR_NAME_MAX + 1)
		return  -ERANGE;
	return 0;
}

// freeing is up to the caller, whether we succeed or not
int setxattr_copy(const char __user *name, struct xattr_ctx *ctx)
{
 	int error;

	if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE))
 		return -EINVAL;

	error = xattr_name_from_user(name, ctx);
 	if (error)
 		return error;

	if (ctx->size) {
		void *p;

		if (ctx->size > XATTR_SIZE_MAX)
 			return -E2BIG;

		p = vmemdup_user(ctx->value, ctx->size);
		if (IS_ERR(p))
			return PTR_ERR(p);
		ctx->kvalue = p;
 	}
	return 0;
}

with the syscall side concluded by freeing ->kvalue (unconditionally), while
the io_uring one frees ->kvalue and ->xattr_name (also unconditionally).  And to
hell with struct xattr_name - a string is a string.
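
Under those conventions the io_uring-side cleanup becomes trivially
unconditional; a rough sketch (io_xattr_cleanup is a made-up name, not
something from this series):

/* Called on every exit, error or not; kfree() and kvfree() are NULL-safe. */
static void io_xattr_cleanup(struct io_xattr *ix)
{
	kfree(ix->ctx.xattr_name);	/* name buffer from xattr_name_from_user() */
	kvfree(ix->ctx.kvalue);		/* value copied in by setxattr_copy() */
}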

However, what I really want to see is the answer to my question re control
flow and the place where we do copy the arguments from userland.  Including
the pathname.

*IF* there's a subtle reason that has to be done from prep phase (and there
might very well be - figuring out the control flow in io_uring is bloody
painful), I would really like to see it spelled out, along with the explanation
of the reasons why statx() doesn't need anything of that sort.

If there's no such reason, I would bloody well leave marshalling to the
payload, allowing a lot more to be shared with the syscall path.  In that
case xattr_ctx only needs to carry the userland pointers/size/flags.
And all that "do we allocate the kernel copy of the name dynamically,
or does it live on stack" simply goes away.

Details, please.
Christian Brauner Dec. 30, 2021, 6:01 p.m. UTC | #7
On Thu, Dec 30, 2021 at 04:16:34PM +0000, Al Viro wrote:
> On Thu, Dec 30, 2021 at 11:12:42AM +0100, Christian Brauner wrote:
> 
> > @@ -545,6 +545,7 @@ EXPORT_SYMBOL_GPL(vfs_removexattr);
> >  int setxattr_copy(const char __user *name, struct xattr_ctx *ctx)
> >  {
> >  	int error;
> > +	struct xattr_ctx *new_ctx;
> >  
> >  	if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE))
> >  		return -EINVAL;
> > @@ -606,12 +607,9 @@ setxattr(struct user_namespace *mnt_userns, struct dentry *d,
> >  	int error;
> >  
> >  	error = setxattr_copy(name, &ctx);
> > -	if (error)
> > -		return error;
> > -
> > -	error = do_setxattr(mnt_userns, d, &ctx);
> > -
> > -	kvfree(ctx.kvalue);
> > +	if (!error)
> > +		error = do_setxattr(mnt_userns, d, &ctx);
> > +	setxattr_finish(&ctx);
> >  	return error;
> >  }
> 
> Huh?  Have you lost a chunk or two in there?  The only modification of
> setxattr_copy() in your delta is the introduction of an unused local
> variable.  Confused...
> 
> What I had in mind is something like this:
> 
> // same for getxattr and setxattr
> static int xattr_name_from_user(const char __user *name, struct xattr_ctx *ctx)
> {
> 	int copied;
> 
> 	if (!ctx->xattr_name) {
> 		ctx->xattr_name = kmalloc(XATTR_NAME_MAX + 1, GFP_KERNEL);
> 		if (!ctx->xattr_name)
> 			return -ENOMEM;
> 	}
> 
> 	copied = strncpy_from_user(ctx->xattr_name, name, XATTR_NAME_MAX + 1);
>  	if (copied < 0)
>  		return copied;	// copyin failure; almost always -EFAULT
> 	if (copied == 0 || copied == XATTR_NAME_MAX + 1)
> 		return  -ERANGE;
> 	return 0;
> }
> 
> // freeing is up to the caller, whether we succeed or not
> int setxattr_copy(const char __user *name, struct xattr_ctx *ctx)
> {
>  	int error;
> 
> 	if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE))
>  		return -EINVAL;
> 
> 	error = xattr_name_from_user(name, ctx);
>  	if (error)
>  		return error;
> 
> 	if (ctx->size) {
> 		void *p;
> 
> 		if (ctx->size > XATTR_SIZE_MAX)
>  			return -E2BIG;
> 
> 		p = vmemdup_user(ctx->value, ctx->size);
> 		if (IS_ERR(p))
> 			return PTR_ERR(p);
> 		ctx->kvalue = p;
>  	}
> 	return 0;
> }
> 
> with syscall side concluded with freeing ->kvalue (unconditionally), while
> io_uring one - ->kvalue and ->xattr_name (also unconditionally).  And to
> hell with struct xattr_name - a string is a string.

Uhm, it wasn't obvious at all that you were just talking about
xattr_ctx->kname. At least to me. I thought you were saying to allocate
struct xattr_ctx dynamically for io_uring and have it static for the
syscall path. Anyway, that change seems sensible to me. I don't care
much if there's a separate struct xattr_name or not.

> 
> However, what I really want to see is the answer to my question re control
> flow and the place where we do copy the arguments from userland.  Including
> the pathname.
> 
> *IF* there's a subtle reason that has to be done from prep phase (and there
> might very well be - figuring out the control flow in io_uring is bloody
> painful), I would really like to see it spelled out, along with the explanation
> of the reasons why statx() doesn't need anything of that sort.
> 
> If there's no such reasons, I would bloody well leave marshalling to the

That's really something the io_uring folks should explain to us. I can't
help much there either apart from what I can gather from looking through
the io_req_prep() switch.

Where it's clear that nearly all syscall operations immediately do a
getname() and/or copy their arguments in the *_prep() phase, not in
the actual "do-the-work" phase. For example, io_epoll_ctl_prep()
copies struct epoll_event via copy_from_user(); it doesn't do it in
io_epoll_ctl(). As such, io_statx_prep() is the outlier...
Jens Axboe Dec. 30, 2021, 7:09 p.m. UTC | #8
On 12/30/21 10:01 AM, Christian Brauner wrote:
>> However, what I really want to see is the answer to my question re control
>> flow and the place where we do copy the arguments from userland.  Including
>> the pathname.
>>
>> *IF* there's a subtle reason that has to be done from prep phase (and there
>> might very well be - figuring out the control flow in io_uring is bloody
>> painful), I would really like to see it spelled out, along with the explanation
>> of the reasons why statx() doesn't need anything of that sort.
>>
>> If there's no such reasons, I would bloody well leave marshalling to the
> 
> That's really something the io_uring folks should explain to us. I can't
> help much there either apart from what I can gather from looking through
> the io_req_prep() switch.
> 
> Where it's clear that nearly all syscall-operations immediately do a
> getname() and/or copy their arguments in the *_prep() phase as, not in
> the actual "do-the-work" phase. For example, io_epoll_ctl_prep() which
> copies struct epoll_event via copy_from_user(). It doesn't do it in
> io_epoll_ctl(). So as such io_statx_prep() is the outlier...

For each command, there are two steps:

- The prep of it. This happens inline from the system call where the
  request, or requests, are submitted. The prep phase should ensure that
  argument structs are stable. Hence a caller can prep a request and
  have memory on stack, as long as it submits before it becomes invalid.
  An example of that is the iovec array for readv/writev (see the sketch
  after this list). The caller does not need to have it stable for the
  duration of the request, just across submit. That's the io_${cmd}_prep()
  helpers.

- The execution of it. May be separate from prep and from an async
  worker. Where the lower layers don't support a nonblocking attempt,
  they are always done async. The statx stuff is an example of that.
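
A minimal userspace sketch of the lifetime rule from the first bullet
(liburing API; ring setup, error handling and the function name are assumed
for illustration):

#include <liburing.h>
#include <sys/uio.h>

static void queue_readv(struct io_uring *ring, int fd, void *buf, size_t len)
{
	/* On-stack iovec: it only has to stay valid until io_uring_submit()
	 * returns, because the prep phase copies it at submit time. */
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	io_uring_prep_readv(sqe, fd, &iov, 1, 0);
	io_uring_submit(ring);
	/* iov may go out of scope now; buf must stay valid until completion. */
}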

Hence the prep side always needs to copy from userland for the statx
family, as execution will happen out-of-line from the submission.

Does that explain it?
Stefan Roesch Dec. 30, 2021, 8:18 p.m. UTC | #9
On 12/29/21 6:17 PM, Al Viro wrote:
> On Wed, Dec 29, 2021 at 12:30:01PM -0800, Stefan Roesch wrote:
> 
>> +static int __io_setxattr_prep(struct io_kiocb *req,
>> +			const struct io_uring_sqe *sqe)
>> +{
>> +	struct io_xattr *ix = &req->xattr;
>> +	const char __user *name;
>> +	int ret;
>> +
>> +	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
>> +		return -EINVAL;
>> +	if (unlikely(sqe->ioprio))
>> +		return -EINVAL;
>> +	if (unlikely(req->flags & REQ_F_FIXED_FILE))
>> +		return -EBADF;
>> +
>> +	ix->filename = NULL;
>> +	name = u64_to_user_ptr(READ_ONCE(sqe->addr));
>> +	ix->ctx.value = u64_to_user_ptr(READ_ONCE(sqe->addr2));
>> +	ix->ctx.kvalue = NULL;
>> +	ix->ctx.size = READ_ONCE(sqe->len);
>> +	ix->ctx.flags = READ_ONCE(sqe->xattr_flags);
>> +
>> +	ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL);
>> +	if (!ix->ctx.kname)
>> +		return -ENOMEM;
>> +
>> +	ret = setxattr_copy(name, &ix->ctx);
>> +	if (ret) {
>> +		kfree(ix->ctx.kname);
>> +		return ret;
>> +	}
>> +
>> +	req->flags |= REQ_F_NEED_CLEANUP;
>> +	return 0;
>> +}
> 
> OK, so you
> 	* allocate a buffer for xattr name
> 	* have setxattr_copy() copy the name in *and* memdup the contents
> 	* on failure, you have the buffer for xattr name freed and return
> an error.  memdup'ed stuff is left for cleanup, presumably.
> 
>> +static int io_setxattr_prep(struct io_kiocb *req,
>> +			const struct io_uring_sqe *sqe)
>> +{
>> +	struct io_xattr *ix = &req->xattr;
>> +	const char __user *path;
>> +	int ret;
>> +
>> +	ret = __io_setxattr_prep(req, sqe);
>> +	if (ret)
>> +		return ret;
>> +
>> +	path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
>> +
>> +	ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
>> +	if (IS_ERR(ix->filename)) {
>> +		ret = PTR_ERR(ix->filename);
>> +		ix->filename = NULL;
>> +	}
>> +
>> +	return ret;
>> +}
> 
> ... and here you use it and bring the pathname in.  Should the latter
> step fail, you restore ->filename to NULL and return an error.
> 
> Could you explain what kind of magic could allow the caller to tell
> whether ix->ctx.kname needs to be freed on error?  I don't see any way
> that could possibly work...

At the end of the function __io_setxattr_prep() we set the flag REQ_F_NEED_CLEANUP.

If the processing fails for some reason, the cleanup code in io_clean_op() gets called
and the data structures get de-allocated.

If the request is processed successfully, the memory gets de-allocated in io_setxattr()
and io_fsetxattr() with the helper function __io_setxattr_finish(). The helper function clears
the flag REQ_F_NEED_CLEANUP, so cleanup is not necessary.

This is the general pattern of cleanup in io-uring.

I can certainly add a cleanup function that is called in all 3 cases (a rough
sketch follows below):
- io_setxattr
- io_fsetxattr
- io_clean_op
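
Something along these lines, mirroring what io_clean_op() does for these
opcodes in the patch below (the helper name is just a placeholder):

/* Rough sketch of the shared cleanup; kfree() and kvfree() are NULL-safe,
 * and putname() is guarded the same way io_clean_op() guards it. */
static void io_setxattr_cleanup(struct io_kiocb *req)
{
	struct io_xattr *ix = &req->xattr;

	if (ix->filename)
		putname(ix->filename);
	kfree(ix->ctx.kname);
	kvfree(ix->ctx.kvalue);
}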
Al Viro Dec. 30, 2021, 10:24 p.m. UTC | #10
On Thu, Dec 30, 2021 at 11:09:12AM -0800, Jens Axboe wrote:

> For each command, there are two steps:
> 
> - The prep of it, this happens inline from the system call where the
>   request, or requests, are submitted. The prep phase should ensure that
>   argument structs are stable. Hence a caller can prep a request and
>   have memory on stack, as long as it submits before it becomes invalid.
>   An example of that are iovecs for readv/writev. The caller does not
>   need to have them stable for the duration of the request, just across
>   submit. That's the io_${cmd}_prep() helpers.
> 
> - The execution of it. May be separate from prep and from an async
>   worker. Where the lower layers don't support a nonblocking attempt,
>   they are always done async. The statx stuff is an example of that.
> 
> Hence prep needs to copy from userland on the prep side always for the
> statx family, as execution will happen out-of-line from the submission.
> 
> Does that explain it?

The actual call chain leading to filename_lookup() is, AFAICS, this:
	io_statx()
		do_statx()
			vfs_statx()
				user_path_at()
					user_path_at_empty()
						filename_lookup()

If you are providing such warranties for the contents of pathname
arguments, you have a bug in statx in the mainline.  If you are not,
there's no point in doing getname() in getxattr prep.

Which one it is?
Jens Axboe Dec. 30, 2021, 10:46 p.m. UTC | #11
On 12/30/21 2:24 PM, Al Viro wrote:
> On Thu, Dec 30, 2021 at 11:09:12AM -0800, Jens Axboe wrote:
> 
>> For each command, there are two steps:
>>
>> - The prep of it, this happens inline from the system call where the
>>   request, or requests, are submitted. The prep phase should ensure that
>>   argument structs are stable. Hence a caller can prep a request and
>>   have memory on stack, as long as it submits before it becomes invalid.
>>   An example of that are iovecs for readv/writev. The caller does not
>>   need to have them stable for the duration of the request, just across
>>   submit. That's the io_${cmd}_prep() helpers.
>>
>> - The execution of it. May be separate from prep and from an async
>>   worker. Where the lower layers don't support a nonblocking attempt,
>>   they are always done async. The statx stuff is an example of that.
>>
>> Hence prep needs to copy from userland on the prep side always for the
>> statx family, as execution will happen out-of-line from the submission.
>>
>> Does that explain it?
> 
> The actual call chain leading to filename_lookup() is, AFAICS, this:
> 	io_statx()
> 		do_statx()
> 			vfs_statx()
> 				user_path_at()
> 					user_path_at_empty()
> 						filename_lookup()
> 
> If you are providing such warranties for the contents of pathname
> arguments, you have a bug in statx in the mainline.  If you are not,
> there's no point in doing getname() in getxattr prep.

Not for the filename lookup, as I said it's for data passed in. There
are no guarantees on filename lookup, that happens when it gets
executed. See mentioned example on iovec and readv/writev.
Al Viro Dec. 30, 2021, 11:02 p.m. UTC | #12
On Thu, Dec 30, 2021 at 02:46:49PM -0800, Jens Axboe wrote:
> On 12/30/21 2:24 PM, Al Viro wrote:
> > On Thu, Dec 30, 2021 at 11:09:12AM -0800, Jens Axboe wrote:
> > 
> >> For each command, there are two steps:
> >>
> >> - The prep of it, this happens inline from the system call where the
> >>   request, or requests, are submitted. The prep phase should ensure that
> >>   argument structs are stable. Hence a caller can prep a request and
> >>   have memory on stack, as long as it submits before it becomes invalid.
> >>   An example of that are iovecs for readv/writev. The caller does not
> >>   need to have them stable for the duration of the request, just across
> >>   submit. That's the io_${cmd}_prep() helpers.
> >>
> >> - The execution of it. May be separate from prep and from an async
> >>   worker. Where the lower layers don't support a nonblocking attempt,
> >>   they are always done async. The statx stuff is an example of that.
> >>
> >> Hence prep needs to copy from userland on the prep side always for the
> >> statx family, as execution will happen out-of-line from the submission.
> >>
> >> Does that explain it?
> > 
> > The actual call chain leading to filename_lookup() is, AFAICS, this:
> > 	io_statx()
> > 		do_statx()
> > 			vfs_statx()
> > 				user_path_at()
> > 					user_path_at_empty()
> > 						filename_lookup()
> > 
> > If you are providing such warranties for the contents of pathname
> > arguments, you have a bug in statx in the mainline.  If you are not,
> > there's no point in doing getname() in getxattr prep.
> 
> Not for the filename lookup, as I said it's for data passed in. There
> are no guarantees on filename lookup, that happens when it gets
> executed. See mentioned example on iovec and readv/writev.

s/filename_lookup/getname_flags/, sorry.

Again, statx support does both the copyin and pathname resolution *after*
prep, from io_statx().  They are not separated - io_statx() passes the userland
pointer to user_path_at_empty(), which does all the work.  So if a pathname
you'd passed had been in a local array and you return right after submitting
a request, you will end up with io_statx() fetching random garbage.

This patchset is different - for getxattr you have getname done in prep,
with resulting struct filename kept around until the actual work is to
be done.  That's precisely the reason why the first patch in the series
introduces a user_path_at_empty() variant that takes a struct filename,
with the pathname contents already copied in.

IOW, why is user_path_at_empty() good for statx, but not for getxattr?
What's the difference?

Do you treat the pathname contents (string in userland memory, that is)
the same way your writev support treats the iovec array (caller may discard it
as soon as syscall returns) or the same way it treats the actual data
to be written (caller is responsible for keeping it around until the
operation reports completion)?

Patch

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c8258c784116..f1b325dd81d5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -82,6 +82,7 @@ 
 #include <linux/audit.h>
 #include <linux/security.h>
 #include <linux/atomic-ref.h>
+#include <linux/xattr.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/io_uring.h>
@@ -726,6 +727,12 @@  struct io_async_rw {
 	struct wait_page_queue		wpq;
 };
 
+struct io_xattr {
+	struct file			*file;
+	struct xattr_ctx		ctx;
+	struct filename			*filename;
+};
+
 enum {
 	REQ_F_FIXED_FILE_BIT	= IOSQE_FIXED_FILE_BIT,
 	REQ_F_IO_DRAIN_BIT	= IOSQE_IO_DRAIN_BIT,
@@ -866,6 +873,7 @@  struct io_kiocb {
 		struct io_symlink	symlink;
 		struct io_hardlink	hardlink;
 		struct io_getdents	getdents;
+		struct io_xattr		xattr;
 	};
 
 	u8				opcode;
@@ -1118,6 +1126,10 @@  static const struct io_op_def io_op_defs[] = {
 	[IORING_OP_GETDENTS] = {
 		.needs_file		= 1,
 	},
+	[IORING_OP_FSETXATTR] = {
+		.needs_file = 1
+	},
+	[IORING_OP_SETXATTR] = {},
 };
 
 /* requests with any of those set should undergo io_disarm_next() */
@@ -3887,6 +3899,139 @@  static int io_renameat(struct io_kiocb *req, unsigned int issue_flags)
 	return 0;
 }
 
+static int __io_setxattr_prep(struct io_kiocb *req,
+			const struct io_uring_sqe *sqe)
+{
+	struct io_xattr *ix = &req->xattr;
+	const char __user *name;
+	int ret;
+
+	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
+		return -EINVAL;
+	if (unlikely(sqe->ioprio))
+		return -EINVAL;
+	if (unlikely(req->flags & REQ_F_FIXED_FILE))
+		return -EBADF;
+
+	ix->filename = NULL;
+	name = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	ix->ctx.value = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+	ix->ctx.kvalue = NULL;
+	ix->ctx.size = READ_ONCE(sqe->len);
+	ix->ctx.flags = READ_ONCE(sqe->xattr_flags);
+
+	ix->ctx.kname = kmalloc(sizeof(*ix->ctx.kname), GFP_KERNEL);
+	if (!ix->ctx.kname)
+		return -ENOMEM;
+
+	ret = setxattr_copy(name, &ix->ctx);
+	if (ret) {
+		kfree(ix->ctx.kname);
+		return ret;
+	}
+
+	req->flags |= REQ_F_NEED_CLEANUP;
+	return 0;
+}
+
+static int io_setxattr_prep(struct io_kiocb *req,
+			const struct io_uring_sqe *sqe)
+{
+	struct io_xattr *ix = &req->xattr;
+	const char __user *path;
+	int ret;
+
+	ret = __io_setxattr_prep(req, sqe);
+	if (ret)
+		return ret;
+
+	path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
+
+	ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
+	if (IS_ERR(ix->filename)) {
+		ret = PTR_ERR(ix->filename);
+		ix->filename = NULL;
+	}
+
+	return ret;
+}
+
+static int io_fsetxattr_prep(struct io_kiocb *req,
+			const struct io_uring_sqe *sqe)
+{
+	return __io_setxattr_prep(req, sqe);
+}
+
+static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags,
+			struct path *path)
+{
+	struct io_xattr *ix = &req->xattr;
+	int ret;
+
+	ret = mnt_want_write(path->mnt);
+	if (!ret) {
+		ret = do_setxattr(mnt_user_ns(path->mnt), path->dentry, &ix->ctx);
+		mnt_drop_write(path->mnt);
+	}
+
+	return ret;
+}
+
+static void __io_setxattr_finish(struct io_kiocb *req, int ret)
+{
+	struct xattr_ctx *ctx = &req->xattr.ctx;
+
+	req->flags &= ~REQ_F_NEED_CLEANUP;
+
+	kfree(ctx->kname);
+	if (ctx->kvalue)
+		kvfree(ctx->kvalue);
+
+	if (ret < 0)
+		req_set_fail(req);
+
+	io_req_complete(req, ret);
+}
+
+static int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags)
+{
+	int ret;
+
+	if (issue_flags & IO_URING_F_NONBLOCK)
+		return -EAGAIN;
+
+	ret = __io_setxattr(req, issue_flags, &req->file->f_path);
+	__io_setxattr_finish(req, ret);
+
+	return 0;
+}
+
+static int io_setxattr(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_xattr *ix = &req->xattr;
+	unsigned int lookup_flags = LOOKUP_FOLLOW;
+	struct path path;
+	int ret;
+
+	if (issue_flags & IO_URING_F_NONBLOCK)
+		return -EAGAIN;
+
+retry:
+	ret = do_user_path_at_empty(AT_FDCWD, ix->filename, lookup_flags, &path);
+	if (!ret) {
+		ret = __io_setxattr(req, issue_flags, &path);
+		path_put(&path);
+		if (retry_estale(ret, lookup_flags)) {
+			lookup_flags |= LOOKUP_REVAL;
+			goto retry;
+		}
+	}
+	putname(ix->filename);
+
+	__io_setxattr_finish(req, ret);
+	return 0;
+}
+
 static int io_unlinkat_prep(struct io_kiocb *req,
 			    const struct io_uring_sqe *sqe)
 {
@@ -6623,6 +6768,10 @@  static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return io_linkat_prep(req, sqe);
 	case IORING_OP_GETDENTS:
 		return io_getdents_prep(req, sqe);
+	case IORING_OP_FSETXATTR:
+		return io_fsetxattr_prep(req, sqe);
+	case IORING_OP_SETXATTR:
+		return io_setxattr_prep(req, sqe);
 	}
 
 	printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n",
@@ -6764,6 +6913,14 @@  static void io_clean_op(struct io_kiocb *req)
 			putname(req->hardlink.oldpath);
 			putname(req->hardlink.newpath);
 			break;
+		case IORING_OP_SETXATTR:
+			if (req->xattr.filename)
+				putname(req->xattr.filename);
+			fallthrough;
+		case IORING_OP_FSETXATTR:
+			kfree(req->xattr.ctx.kname);
+			kvfree(req->xattr.ctx.kvalue);
+			break;
 		}
 	}
 	if ((req->flags & REQ_F_POLLED) && req->apoll) {
@@ -6909,6 +7066,12 @@  static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 	case IORING_OP_GETDENTS:
 		ret = io_getdents(req, issue_flags);
 		break;
+	case IORING_OP_FSETXATTR:
+		ret = io_fsetxattr(req, issue_flags);
+		break;
+	case IORING_OP_SETXATTR:
+		ret = io_setxattr(req, issue_flags);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -11277,6 +11440,7 @@  static int __init io_uring_init(void)
 	BUILD_BUG_SQE_ELEM(42, __u16,  personality);
 	BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
 	BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
+	BUILD_BUG_SQE_ELEM(48, __u64,  addr3);
 
 	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
 		     sizeof(struct io_uring_rsrc_update));
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 57dc88db5793..c62a8bec8cd4 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -45,6 +45,7 @@  struct io_uring_sqe {
 		__u32		rename_flags;
 		__u32		unlink_flags;
 		__u32		hardlink_flags;
+		__u32		xattr_flags;
 	};
 	__u64	user_data;	/* data to be passed back at completion time */
 	/* pack this to avoid bogus arm OABI complaints */
@@ -60,7 +61,8 @@  struct io_uring_sqe {
 		__s32	splice_fd_in;
 		__u32	file_index;
 	};
-	__u64	__pad2[2];
+	__u64	addr3;
+	__u64	__pad2[1];
 };
 
 enum {
@@ -144,6 +146,8 @@  enum {
 	IORING_OP_SYMLINKAT,
 	IORING_OP_LINKAT,
 	IORING_OP_GETDENTS,
+	IORING_OP_FSETXATTR,
+	IORING_OP_SETXATTR,
 
 	/* this goes last, obviously */
 	IORING_OP_LAST,