diff mbox series

[v2] fs: create kiocb_{start,end}_write() helpers

Message ID 20230816085439.894112-1-amir73il@gmail.com (mailing list archive)
State New, archived
Headers show
Series [v2] fs: create kiocb_{start,end}_write() helpers | expand

Commit Message

Amir Goldstein Aug. 16, 2023, 8:54 a.m. UTC
aio, io_uring, cachefiles and overlayfs, all open code an ugly variant
of file_{start,end}_write() to silence lockdep warnings.

Create helpers for this lockdep dance and use the helpers in all the
callers.

Use a new iocb flag IOCB_WRITE_STARTED to indicate if sb_start_write()
was called.

Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---

Christian,

This is an attempt to consolidate the open coded lockdep fooling in
all those async io submitters into a single helper.
The idea to do that consolidation was suggested by Jan.
The (questionable) idea to use an IOCB_ flag was mine.

This re-factoring is part of a larger vfs cleanup I am doing for
fanotify permission events.  The complete series is not ready for
prime time yet, but this one patch is independent and I would love
to get it reviewed/merged a head of the rest.

Thanks,
Amir.

Changes since v1:
- Leave ISREG checks in callers (Jens)
- Leave setting IOCB_WRITE flag in callers

 fs/aio.c            | 26 +++----------------
 fs/cachefiles/io.c  | 16 +++---------
 fs/overlayfs/file.c | 15 +++++------
 include/linux/fs.h  | 62 +++++++++++++++++++++++++++++++++++++++++++--
 io_uring/rw.c       | 36 +++++---------------------
 5 files changed, 79 insertions(+), 76 deletions(-)

Comments

Jan Kara Aug. 16, 2023, 3:01 p.m. UTC | #1
On Wed 16-08-23 11:54:39, Amir Goldstein wrote:
> aio, io_uring, cachefiles and overlayfs, all open code an ugly variant
> of file_{start,end}_write() to silence lockdep warnings.
> 
> Create helpers for this lockdep dance and use the helpers in all the
> callers.
> 
> Use a new iocb flag IOCB_WRITE_STARTED to indicate if sb_start_write()
> was called.
> 
> Suggested-by: Jan Kara <jack@suse.cz>
> Signed-off-by: Amir Goldstein <amir73il@gmail.com>

Yeah, looks like a good cleanup to me. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza


> ---
> 
> Christian,
> 
> This is an attempt to consolidate the open coded lockdep fooling in
> all those async io submitters into a single helper.
> The idea to do that consolidation was suggested by Jan.
> The (questionable) idea to use an IOCB_ flag was mine.
> 
> This re-factoring is part of a larger vfs cleanup I am doing for
> fanotify permission events.  The complete series is not ready for
> prime time yet, but this one patch is independent and I would love
> to get it reviewed/merged a head of the rest.
> 
> Thanks,
> Amir.
> 
> Changes since v1:
> - Leave ISREG checks in callers (Jens)
> - Leave setting IOCB_WRITE flag in callers
> 
>  fs/aio.c            | 26 +++----------------
>  fs/cachefiles/io.c  | 16 +++---------
>  fs/overlayfs/file.c | 15 +++++------
>  include/linux/fs.h  | 62 +++++++++++++++++++++++++++++++++++++++++++--
>  io_uring/rw.c       | 36 +++++---------------------
>  5 files changed, 79 insertions(+), 76 deletions(-)
> 
> diff --git a/fs/aio.c b/fs/aio.c
> index 77e33619de40..16fb3ac2093b 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -1444,17 +1444,8 @@ static void aio_complete_rw(struct kiocb *kiocb, long res)
>  	if (!list_empty_careful(&iocb->ki_list))
>  		aio_remove_iocb(iocb);
>  
> -	if (kiocb->ki_flags & IOCB_WRITE) {
> -		struct inode *inode = file_inode(kiocb->ki_filp);
> -
> -		/*
> -		 * Tell lockdep we inherited freeze protection from submission
> -		 * thread.
> -		 */
> -		if (S_ISREG(inode->i_mode))
> -			__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
> -		file_end_write(kiocb->ki_filp);
> -	}
> +	if (kiocb->ki_flags & IOCB_WRITE)
> +		kiocb_end_write(kiocb);
>  
>  	iocb->ki_res.res = res;
>  	iocb->ki_res.res2 = 0;
> @@ -1581,17 +1572,8 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
>  		return ret;
>  	ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
>  	if (!ret) {
> -		/*
> -		 * Open-code file_start_write here to grab freeze protection,
> -		 * which will be released by another thread in
> -		 * aio_complete_rw().  Fool lockdep by telling it the lock got
> -		 * released so that it doesn't complain about the held lock when
> -		 * we return to userspace.
> -		 */
> -		if (S_ISREG(file_inode(file)->i_mode)) {
> -			sb_start_write(file_inode(file)->i_sb);
> -			__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
> -		}
> +		if (S_ISREG(file_inode(file)->i_mode))
> +			kiocb_start_write(req);
>  		req->ki_flags |= IOCB_WRITE;
>  		aio_rw_done(req, call_write_iter(file, req, &iter));
>  	}
> diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
> index 175a25fcade8..6e16f7c116da 100644
> --- a/fs/cachefiles/io.c
> +++ b/fs/cachefiles/io.c
> @@ -259,9 +259,7 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret)
>  
>  	_enter("%ld", ret);
>  
> -	/* Tell lockdep we inherited freeze protection from submission thread */
> -	__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
> -	__sb_end_write(inode->i_sb, SB_FREEZE_WRITE);
> +	kiocb_end_write(iocb);
>  
>  	if (ret < 0)
>  		trace_cachefiles_io_error(object, inode, ret,
> @@ -286,7 +284,6 @@ int __cachefiles_write(struct cachefiles_object *object,
>  {
>  	struct cachefiles_cache *cache;
>  	struct cachefiles_kiocb *ki;
> -	struct inode *inode;
>  	unsigned int old_nofs;
>  	ssize_t ret;
>  	size_t len = iov_iter_count(iter);
> @@ -322,19 +319,12 @@ int __cachefiles_write(struct cachefiles_object *object,
>  		ki->iocb.ki_complete = cachefiles_write_complete;
>  	atomic_long_add(ki->b_writing, &cache->b_writing);
>  
> -	/* Open-code file_start_write here to grab freeze protection, which
> -	 * will be released by another thread in aio_complete_rw().  Fool
> -	 * lockdep by telling it the lock got released so that it doesn't
> -	 * complain about the held lock when we return to userspace.
> -	 */
> -	inode = file_inode(file);
> -	__sb_start_write(inode->i_sb, SB_FREEZE_WRITE);
> -	__sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
> +	kiocb_start_write(ki);
>  
>  	get_file(ki->iocb.ki_filp);
>  	cachefiles_grab_object(object, cachefiles_obj_get_ioreq);
>  
> -	trace_cachefiles_write(object, inode, ki->iocb.ki_pos, len);
> +	trace_cachefiles_write(object, file_inode(file), ki->iocb.ki_pos, len);
>  	old_nofs = memalloc_nofs_save();
>  	ret = cachefiles_inject_write_error();
>  	if (ret == 0)
> diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
> index 21245b00722a..c756edb9bd4c 100644
> --- a/fs/overlayfs/file.c
> +++ b/fs/overlayfs/file.c
> @@ -290,10 +290,7 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
>  	if (iocb->ki_flags & IOCB_WRITE) {
>  		struct inode *inode = file_inode(orig_iocb->ki_filp);
>  
> -		/* Actually acquired in ovl_write_iter() */
> -		__sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb,
> -				      SB_FREEZE_WRITE);
> -		file_end_write(iocb->ki_filp);
> +		kiocb_end_write(iocb);
>  		ovl_copyattr(inode);
>  	}
>  
> @@ -362,6 +359,9 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
>  	return ret;
>  }
>  
> +/* IOCB flags that may be propagated to real file io */
> +#define OVL_IOCB_MASK ~(IOCB_WRITE_STARTED)
> +
>  static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
>  {
>  	struct file *file = iocb->ki_filp;
> @@ -369,7 +369,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
>  	struct fd real;
>  	const struct cred *old_cred;
>  	ssize_t ret;
> -	int ifl = iocb->ki_flags;
> +	int ifl = iocb->ki_flags & OVL_IOCB_MASK;
>  
>  	if (!iov_iter_count(iter))
>  		return 0;
> @@ -409,10 +409,6 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
>  		if (!aio_req)
>  			goto out;
>  
> -		file_start_write(real.file);
> -		/* Pacify lockdep, same trick as done in aio_write() */
> -		__sb_writers_release(file_inode(real.file)->i_sb,
> -				     SB_FREEZE_WRITE);
>  		aio_req->fd = real;
>  		real.flags = 0;
>  		aio_req->orig_iocb = iocb;
> @@ -420,6 +416,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
>  		aio_req->iocb.ki_flags = ifl;
>  		aio_req->iocb.ki_complete = ovl_aio_rw_complete;
>  		refcount_set(&aio_req->ref, 2);
> +		kiocb_start_write(&aio_req->iocb);
>  		ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
>  		ovl_aio_put(aio_req);
>  		if (ret != -EIOCBQUEUED)
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index b2adee67f9b2..8e5d410a1be5 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -338,6 +338,8 @@ enum rw_hint {
>  #define IOCB_NOIO		(1 << 20)
>  /* can use bio alloc cache */
>  #define IOCB_ALLOC_CACHE	(1 << 21)
> +/* file_start_write() was called */
> +#define IOCB_WRITE_STARTED	(1 << 22)
>  
>  /* for use in trace events */
>  #define TRACE_IOCB_STRINGS \
> @@ -351,7 +353,8 @@ enum rw_hint {
>  	{ IOCB_WRITE,		"WRITE" }, \
>  	{ IOCB_WAITQ,		"WAITQ" }, \
>  	{ IOCB_NOIO,		"NOIO" }, \
> -	{ IOCB_ALLOC_CACHE,	"ALLOC_CACHE" }
> +	{ IOCB_ALLOC_CACHE,	"ALLOC_CACHE" }, \
> +	{ IOCB_WRITE_STARTED,	"WRITE_STARTED" }
>  
>  struct kiocb {
>  	struct file		*ki_filp;
> @@ -2545,6 +2548,13 @@ static inline bool inode_wrong_type(const struct inode *inode, umode_t mode)
>  	return (inode->i_mode ^ mode) & S_IFMT;
>  }
>  
> +/**
> + * file_start_write - get write access to a superblock for regular file io
> + * @file: the file we want to write to
> + *
> + * This is a variant of sb_start_write() which is a noop on non-regualr file.
> + * Should be matched with a call to file_end_write().
> + */
>  static inline void file_start_write(struct file *file)
>  {
>  	if (!S_ISREG(file_inode(file)->i_mode))
> @@ -2559,11 +2569,59 @@ static inline bool file_start_write_trylock(struct file *file)
>  	return sb_start_write_trylock(file_inode(file)->i_sb);
>  }
>  
> +/**
> + * file_end_write - drop write access to a superblock of a regular file
> + * @file: the file we wrote to
> + *
> + * Should be matched with a call to file_start_write().
> + */
>  static inline void file_end_write(struct file *file)
>  {
>  	if (!S_ISREG(file_inode(file)->i_mode))
>  		return;
> -	__sb_end_write(file_inode(file)->i_sb, SB_FREEZE_WRITE);
> +	sb_end_write(file_inode(file)->i_sb);
> +}
> +
> +/**
> + * kiocb_start_write - get write access to a superblock for async file io
> + * @iocb: the io context we want to submit the write with
> + *
> + * This is a variant of file_start_write() for async io submission.
> + * Should be matched with a call to kiocb_end_write().
> + */
> +static inline void kiocb_start_write(struct kiocb *iocb)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +
> +	if (WARN_ON_ONCE(iocb->ki_flags & IOCB_WRITE_STARTED))
> +		return;
> +	sb_start_write(inode->i_sb);
> +	/*
> +	 * Fool lockdep by telling it the lock got released so that it
> +	 * doesn't complain about the held lock when we return to userspace.
> +	 */
> +	__sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
> +	iocb->ki_flags |= IOCB_WRITE_STARTED;
> +}
> +
> +/**
> + * kiocb_end_write - drop write access to a superblock after async file io
> + * @iocb: the io context we sumbitted the write with
> + *
> + * Should be matched with a call to kiocb_start_write().
> + */
> +static inline void kiocb_end_write(struct kiocb *iocb)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +
> +	if (!(iocb->ki_flags & IOCB_WRITE_STARTED))
> +		return;
> +	/*
> +	 * Tell lockdep we inherited freeze protection from submission thread.
> +	 */
> +	__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
> +	sb_end_write(inode->i_sb);
> +	iocb->ki_flags &= ~IOCB_WRITE_STARTED;
>  }
>  
>  /*
> diff --git a/io_uring/rw.c b/io_uring/rw.c
> index 1bce2208b65c..362d48493096 100644
> --- a/io_uring/rw.c
> +++ b/io_uring/rw.c
> @@ -220,20 +220,6 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
>  }
>  #endif
>  
> -static void kiocb_end_write(struct io_kiocb *req)
> -{
> -	/*
> -	 * Tell lockdep we inherited freeze protection from submission
> -	 * thread.
> -	 */
> -	if (req->flags & REQ_F_ISREG) {
> -		struct super_block *sb = file_inode(req->file)->i_sb;
> -
> -		__sb_writers_acquired(sb, SB_FREEZE_WRITE);
> -		sb_end_write(sb);
> -	}
> -}
> -
>  /*
>   * Trigger the notifications after having done some IO, and finish the write
>   * accounting, if any.
> @@ -243,7 +229,7 @@ static void io_req_io_end(struct io_kiocb *req)
>  	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
>  
>  	if (rw->kiocb.ki_flags & IOCB_WRITE) {
> -		kiocb_end_write(req);
> +		kiocb_end_write(&rw->kiocb);
>  		fsnotify_modify(req->file);
>  	} else {
>  		fsnotify_access(req->file);
> @@ -313,7 +299,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
>  	struct io_kiocb *req = cmd_to_io_kiocb(rw);
>  
>  	if (kiocb->ki_flags & IOCB_WRITE)
> -		kiocb_end_write(req);
> +		kiocb_end_write(kiocb);
>  	if (unlikely(res != req->cqe.res)) {
>  		if (res == -EAGAIN && io_rw_should_reissue(req)) {
>  			req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
> @@ -902,18 +888,8 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
>  		return ret;
>  	}
>  
> -	/*
> -	 * Open-code file_start_write here to grab freeze protection,
> -	 * which will be released by another thread in
> -	 * io_complete_rw().  Fool lockdep by telling it the lock got
> -	 * released so that it doesn't complain about the held lock when
> -	 * we return to userspace.
> -	 */
> -	if (req->flags & REQ_F_ISREG) {
> -		sb_start_write(file_inode(req->file)->i_sb);
> -		__sb_writers_release(file_inode(req->file)->i_sb,
> -					SB_FREEZE_WRITE);
> -	}
> +	if (req->flags & REQ_F_ISREG)
> +		kiocb_start_write(kiocb);
>  	kiocb->ki_flags |= IOCB_WRITE;
>  
>  	if (likely(req->file->f_op->write_iter))
> @@ -961,7 +937,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
>  				io->bytes_done += ret2;
>  
>  			if (kiocb->ki_flags & IOCB_WRITE)
> -				kiocb_end_write(req);
> +				kiocb_end_write(kiocb);
>  			return ret ? ret : -EAGAIN;
>  		}
>  done:
> @@ -972,7 +948,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
>  		ret = io_setup_async_rw(req, iovec, s, false);
>  		if (!ret) {
>  			if (kiocb->ki_flags & IOCB_WRITE)
> -				kiocb_end_write(req);
> +				kiocb_end_write(kiocb);
>  			return -EAGAIN;
>  		}
>  		return ret;
> -- 
> 2.34.1
>
Jens Axboe Aug. 16, 2023, 3:28 p.m. UTC | #2
On 8/16/23 2:54 AM, Amir Goldstein wrote:
> aio, io_uring, cachefiles and overlayfs, all open code an ugly variant
> of file_{start,end}_write() to silence lockdep warnings.
> 
> Create helpers for this lockdep dance and use the helpers in all the
> callers.
> 
> Use a new iocb flag IOCB_WRITE_STARTED to indicate if sb_start_write()
> was called.

Looks better now, but I think you should split this into a prep patch
that adds the helpers, and then one for each conversion. We've had bugs
with this accounting before which causes fs freeze issues, would be
prudent to have it split because of that.

> diff --git a/fs/aio.c b/fs/aio.c
> index 77e33619de40..16fb3ac2093b 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -1444,17 +1444,8 @@ static void aio_complete_rw(struct kiocb *kiocb, long res)
>  	if (!list_empty_careful(&iocb->ki_list))
>  		aio_remove_iocb(iocb);
>  
> -	if (kiocb->ki_flags & IOCB_WRITE) {
> -		struct inode *inode = file_inode(kiocb->ki_filp);
> -
> -		/*
> -		 * Tell lockdep we inherited freeze protection from submission
> -		 * thread.
> -		 */
> -		if (S_ISREG(inode->i_mode))
> -			__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
> -		file_end_write(kiocb->ki_filp);
> -	}
> +	if (kiocb->ki_flags & IOCB_WRITE)
> +		kiocb_end_write(kiocb);

Can't we just call kiocb_end_write() here, it checks WRITE_STARTED
anyway? Not a big deal, and honestly I'd rather just get rid of
WRITE_STARTED if we're not using it like that. It doesn't serve much of
a purpose, if we're gating this one IOCB_WRITE anyway (which I do like
better than WRITE_STARTED). And it avoids writing to the kiocb at the
end too, which is a nice win.

> index b2adee67f9b2..8e5d410a1be5 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -338,6 +338,8 @@ enum rw_hint {
>  #define IOCB_NOIO		(1 << 20)
>  /* can use bio alloc cache */
>  #define IOCB_ALLOC_CACHE	(1 << 21)
> +/* file_start_write() was called */
> +#define IOCB_WRITE_STARTED	(1 << 22)
>  
>  /* for use in trace events */
>  #define TRACE_IOCB_STRINGS \
> @@ -351,7 +353,8 @@ enum rw_hint {
>  	{ IOCB_WRITE,		"WRITE" }, \
>  	{ IOCB_WAITQ,		"WAITQ" }, \
>  	{ IOCB_NOIO,		"NOIO" }, \
> -	{ IOCB_ALLOC_CACHE,	"ALLOC_CACHE" }
> +	{ IOCB_ALLOC_CACHE,	"ALLOC_CACHE" }, \
> +	{ IOCB_WRITE_STARTED,	"WRITE_STARTED" }
>  
>  struct kiocb {
>  	struct file		*ki_filp;

These changes will conflict with other changes in linux-next that are
going upstream. I'd prefer to stage this one after those changes, once
we get to a version that looks good to everybody.
Amir Goldstein Aug. 16, 2023, 6:58 p.m. UTC | #3
On Wed, Aug 16, 2023 at 6:28 PM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 8/16/23 2:54 AM, Amir Goldstein wrote:
> > aio, io_uring, cachefiles and overlayfs, all open code an ugly variant
> > of file_{start,end}_write() to silence lockdep warnings.
> >
> > Create helpers for this lockdep dance and use the helpers in all the
> > callers.
> >
> > Use a new iocb flag IOCB_WRITE_STARTED to indicate if sb_start_write()
> > was called.
>
> Looks better now, but I think you should split this into a prep patch
> that adds the helpers, and then one for each conversion. We've had bugs
> with this accounting before which causes fs freeze issues, would be
> prudent to have it split because of that.
>

Sure, no problem.

> > diff --git a/fs/aio.c b/fs/aio.c
> > index 77e33619de40..16fb3ac2093b 100644
> > --- a/fs/aio.c
> > +++ b/fs/aio.c
> > @@ -1444,17 +1444,8 @@ static void aio_complete_rw(struct kiocb *kiocb, long res)
> >       if (!list_empty_careful(&iocb->ki_list))
> >               aio_remove_iocb(iocb);
> >
> > -     if (kiocb->ki_flags & IOCB_WRITE) {
> > -             struct inode *inode = file_inode(kiocb->ki_filp);
> > -
> > -             /*
> > -              * Tell lockdep we inherited freeze protection from submission
> > -              * thread.
> > -              */
> > -             if (S_ISREG(inode->i_mode))
> > -                     __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
> > -             file_end_write(kiocb->ki_filp);
> > -     }
> > +     if (kiocb->ki_flags & IOCB_WRITE)
> > +             kiocb_end_write(kiocb);
>
> Can't we just call kiocb_end_write() here, it checks WRITE_STARTED
> anyway? Not a big deal, and honestly I'd rather just get rid of
> WRITE_STARTED if we're not using it like that. It doesn't serve much of
> a purpose, if we're gating this one IOCB_WRITE anyway (which I do like
> better than WRITE_STARTED). And it avoids writing to the kiocb at the
> end too, which is a nice win.
>

What I don't like about it is that kiocb_{start,end}_write() calls will
not be balanced, so it is harder for a code reviewer to realize that the
code is correct (as opposed to have excess end_write calls).
That's the reason I had ISREG check inside the helpers in v1, so like
file_{start,end}_write(), the calls will be balanced and easy to review.

I am not sure, but I think that getting rid of WRITE_STARTED will
make the code more subtle, because right now, IOCB_WRITE is
not set by kiocb_start_write() and many times it is set much sooner.
So I'd rather keep the WRITE_STARTED flag for a more roust code
if that's ok with you.

> > index b2adee67f9b2..8e5d410a1be5 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -338,6 +338,8 @@ enum rw_hint {
> >  #define IOCB_NOIO            (1 << 20)
> >  /* can use bio alloc cache */
> >  #define IOCB_ALLOC_CACHE     (1 << 21)
> > +/* file_start_write() was called */
> > +#define IOCB_WRITE_STARTED   (1 << 22)
> >
> >  /* for use in trace events */
> >  #define TRACE_IOCB_STRINGS \
> > @@ -351,7 +353,8 @@ enum rw_hint {
> >       { IOCB_WRITE,           "WRITE" }, \
> >       { IOCB_WAITQ,           "WAITQ" }, \
> >       { IOCB_NOIO,            "NOIO" }, \
> > -     { IOCB_ALLOC_CACHE,     "ALLOC_CACHE" }
> > +     { IOCB_ALLOC_CACHE,     "ALLOC_CACHE" }, \
> > +     { IOCB_WRITE_STARTED,   "WRITE_STARTED" }
> >
> >  struct kiocb {
> >       struct file             *ki_filp;
>
> These changes will conflict with other changes in linux-next that are
> going upstream. I'd prefer to stage this one after those changes, once
> we get to a version that looks good to everybody.

Changes coming to linux-next from your tree?
I was basing my patches on Christian's vfs.all branch.
Do you mean that you would like to stage this cleanup?
It's fine by me.

Thanks,
Amir.
Jens Axboe Aug. 16, 2023, 7:13 p.m. UTC | #4
On 8/16/23 12:58 PM, Amir Goldstein wrote:
>>> diff --git a/fs/aio.c b/fs/aio.c
>>> index 77e33619de40..16fb3ac2093b 100644
>>> --- a/fs/aio.c
>>> +++ b/fs/aio.c
>>> @@ -1444,17 +1444,8 @@ static void aio_complete_rw(struct kiocb *kiocb, long res)
>>>       if (!list_empty_careful(&iocb->ki_list))
>>>               aio_remove_iocb(iocb);
>>>
>>> -     if (kiocb->ki_flags & IOCB_WRITE) {
>>> -             struct inode *inode = file_inode(kiocb->ki_filp);
>>> -
>>> -             /*
>>> -              * Tell lockdep we inherited freeze protection from submission
>>> -              * thread.
>>> -              */
>>> -             if (S_ISREG(inode->i_mode))
>>> -                     __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
>>> -             file_end_write(kiocb->ki_filp);
>>> -     }
>>> +     if (kiocb->ki_flags & IOCB_WRITE)
>>> +             kiocb_end_write(kiocb);
>>
>> Can't we just call kiocb_end_write() here, it checks WRITE_STARTED
>> anyway? Not a big deal, and honestly I'd rather just get rid of
>> WRITE_STARTED if we're not using it like that. It doesn't serve much of
>> a purpose, if we're gating this one IOCB_WRITE anyway (which I do like
>> better than WRITE_STARTED). And it avoids writing to the kiocb at the
>> end too, which is a nice win.
>>
> 
> What I don't like about it is that kiocb_{start,end}_write() calls will
> not be balanced, so it is harder for a code reviewer to realize that the
> code is correct (as opposed to have excess end_write calls).
> That's the reason I had ISREG check inside the helpers in v1, so like
> file_{start,end}_write(), the calls will be balanced and easy to review.

If you just gate it on IOCB_WRITE and local IS_REG test, then it should
be balanced.

> I am not sure, but I think that getting rid of WRITE_STARTED will
> make the code more subtle, because right now, IOCB_WRITE is
> not set by kiocb_start_write() and many times it is set much sooner.
> So I'd rather keep the WRITE_STARTED flag for a more roust code
> if that's ok with you.

Adding random flags to protect against that is not a good idea imho. It
adds overhead and while it may seem like it's hardening, it's also then
just making it easier to misuse.

I would start with just adding the helpers, and having the callers gate
them with IOCB_WRITE && IS_REG like they do now.

>> index b2adee67f9b2..8e5d410a1be5 100644
>>> --- a/include/linux/fs.h
>>> +++ b/include/linux/fs.h
>>> @@ -338,6 +338,8 @@ enum rw_hint {
>>>  #define IOCB_NOIO            (1 << 20)
>>>  /* can use bio alloc cache */
>>>  #define IOCB_ALLOC_CACHE     (1 << 21)
>>> +/* file_start_write() was called */
>>> +#define IOCB_WRITE_STARTED   (1 << 22)
>>>
>>>  /* for use in trace events */
>>>  #define TRACE_IOCB_STRINGS \
>>> @@ -351,7 +353,8 @@ enum rw_hint {
>>>       { IOCB_WRITE,           "WRITE" }, \
>>>       { IOCB_WAITQ,           "WAITQ" }, \
>>>       { IOCB_NOIO,            "NOIO" }, \
>>> -     { IOCB_ALLOC_CACHE,     "ALLOC_CACHE" }
>>> +     { IOCB_ALLOC_CACHE,     "ALLOC_CACHE" }, \
>>> +     { IOCB_WRITE_STARTED,   "WRITE_STARTED" }
>>>
>>>  struct kiocb {
>>>       struct file             *ki_filp;
>>
>> These changes will conflict with other changes in linux-next that are
>> going upstream. I'd prefer to stage this one after those changes, once
>> we get to a version that looks good to everybody.
> 
> Changes coming to linux-next from your tree?
> I was basing my patches on Christian's vfs.all branch.
> Do you mean that you would like to stage this cleanup?
> It's fine by me.

From the xfs tree, if you look at linux-next you should see them. I
don't care who stages it, but I don't want to create needless merge
conflicts because people weren't aware of these conflicts.
Amir Goldstein Aug. 17, 2023, 1:18 p.m. UTC | #5
On Wed, Aug 16, 2023 at 10:13 PM Jens Axboe <axboe@kernel.dk> wrote:
>
> On 8/16/23 12:58 PM, Amir Goldstein wrote:
> >>> diff --git a/fs/aio.c b/fs/aio.c
> >>> index 77e33619de40..16fb3ac2093b 100644
> >>> --- a/fs/aio.c
> >>> +++ b/fs/aio.c
> >>> @@ -1444,17 +1444,8 @@ static void aio_complete_rw(struct kiocb *kiocb, long res)
> >>>       if (!list_empty_careful(&iocb->ki_list))
> >>>               aio_remove_iocb(iocb);
> >>>
> >>> -     if (kiocb->ki_flags & IOCB_WRITE) {
> >>> -             struct inode *inode = file_inode(kiocb->ki_filp);
> >>> -
> >>> -             /*
> >>> -              * Tell lockdep we inherited freeze protection from submission
> >>> -              * thread.
> >>> -              */
> >>> -             if (S_ISREG(inode->i_mode))
> >>> -                     __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
> >>> -             file_end_write(kiocb->ki_filp);
> >>> -     }
> >>> +     if (kiocb->ki_flags & IOCB_WRITE)
> >>> +             kiocb_end_write(kiocb);
> >>
> >> Can't we just call kiocb_end_write() here, it checks WRITE_STARTED
> >> anyway? Not a big deal, and honestly I'd rather just get rid of
> >> WRITE_STARTED if we're not using it like that. It doesn't serve much of
> >> a purpose, if we're gating this one IOCB_WRITE anyway (which I do like
> >> better than WRITE_STARTED). And it avoids writing to the kiocb at the
> >> end too, which is a nice win.
> >>
> >
> > What I don't like about it is that kiocb_{start,end}_write() calls will
> > not be balanced, so it is harder for a code reviewer to realize that the
> > code is correct (as opposed to have excess end_write calls).
> > That's the reason I had ISREG check inside the helpers in v1, so like
> > file_{start,end}_write(), the calls will be balanced and easy to review.
>
> If you just gate it on IOCB_WRITE and local IS_REG test, then it should
> be balanced.
>
> > I am not sure, but I think that getting rid of WRITE_STARTED will
> > make the code more subtle, because right now, IOCB_WRITE is
> > not set by kiocb_start_write() and many times it is set much sooner.
> > So I'd rather keep the WRITE_STARTED flag for a more roust code
> > if that's ok with you.
>
> Adding random flags to protect against that is not a good idea imho. It
> adds overhead and while it may seem like it's hardening, it's also then
> just making it easier to misuse.
>
> I would start with just adding the helpers, and having the callers gate
> them with IOCB_WRITE && IS_REG like they do now.
>

All right.

> >> index b2adee67f9b2..8e5d410a1be5 100644
> >>> --- a/include/linux/fs.h
> >>> +++ b/include/linux/fs.h
> >>> @@ -338,6 +338,8 @@ enum rw_hint {
> >>>  #define IOCB_NOIO            (1 << 20)
> >>>  /* can use bio alloc cache */
> >>>  #define IOCB_ALLOC_CACHE     (1 << 21)
> >>> +/* file_start_write() was called */
> >>> +#define IOCB_WRITE_STARTED   (1 << 22)
> >>>
> >>>  /* for use in trace events */
> >>>  #define TRACE_IOCB_STRINGS \
> >>> @@ -351,7 +353,8 @@ enum rw_hint {
> >>>       { IOCB_WRITE,           "WRITE" }, \
> >>>       { IOCB_WAITQ,           "WAITQ" }, \
> >>>       { IOCB_NOIO,            "NOIO" }, \
> >>> -     { IOCB_ALLOC_CACHE,     "ALLOC_CACHE" }
> >>> +     { IOCB_ALLOC_CACHE,     "ALLOC_CACHE" }, \
> >>> +     { IOCB_WRITE_STARTED,   "WRITE_STARTED" }
> >>>
> >>>  struct kiocb {
> >>>       struct file             *ki_filp;
> >>
> >> These changes will conflict with other changes in linux-next that are
> >> going upstream. I'd prefer to stage this one after those changes, once
> >> we get to a version that looks good to everybody.
> >
> > Changes coming to linux-next from your tree?
> > I was basing my patches on Christian's vfs.all branch.
> > Do you mean that you would like to stage this cleanup?
> > It's fine by me.
>
> From the xfs tree, if you look at linux-next you should see them. I
> don't care who stages it, but I don't want to create needless merge
> conflicts because people weren't aware of these conflicts.
>

Ok. I got rid of IOCB_WRITE_STARTED, so there is no conflict now.

Thanks,
Amir.
diff mbox series

Patch

diff --git a/fs/aio.c b/fs/aio.c
index 77e33619de40..16fb3ac2093b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1444,17 +1444,8 @@  static void aio_complete_rw(struct kiocb *kiocb, long res)
 	if (!list_empty_careful(&iocb->ki_list))
 		aio_remove_iocb(iocb);
 
-	if (kiocb->ki_flags & IOCB_WRITE) {
-		struct inode *inode = file_inode(kiocb->ki_filp);
-
-		/*
-		 * Tell lockdep we inherited freeze protection from submission
-		 * thread.
-		 */
-		if (S_ISREG(inode->i_mode))
-			__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
-		file_end_write(kiocb->ki_filp);
-	}
+	if (kiocb->ki_flags & IOCB_WRITE)
+		kiocb_end_write(kiocb);
 
 	iocb->ki_res.res = res;
 	iocb->ki_res.res2 = 0;
@@ -1581,17 +1572,8 @@  static int aio_write(struct kiocb *req, const struct iocb *iocb,
 		return ret;
 	ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter));
 	if (!ret) {
-		/*
-		 * Open-code file_start_write here to grab freeze protection,
-		 * which will be released by another thread in
-		 * aio_complete_rw().  Fool lockdep by telling it the lock got
-		 * released so that it doesn't complain about the held lock when
-		 * we return to userspace.
-		 */
-		if (S_ISREG(file_inode(file)->i_mode)) {
-			sb_start_write(file_inode(file)->i_sb);
-			__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
-		}
+		if (S_ISREG(file_inode(file)->i_mode))
+			kiocb_start_write(req);
 		req->ki_flags |= IOCB_WRITE;
 		aio_rw_done(req, call_write_iter(file, req, &iter));
 	}
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 175a25fcade8..6e16f7c116da 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -259,9 +259,7 @@  static void cachefiles_write_complete(struct kiocb *iocb, long ret)
 
 	_enter("%ld", ret);
 
-	/* Tell lockdep we inherited freeze protection from submission thread */
-	__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
-	__sb_end_write(inode->i_sb, SB_FREEZE_WRITE);
+	kiocb_end_write(iocb);
 
 	if (ret < 0)
 		trace_cachefiles_io_error(object, inode, ret,
@@ -286,7 +284,6 @@  int __cachefiles_write(struct cachefiles_object *object,
 {
 	struct cachefiles_cache *cache;
 	struct cachefiles_kiocb *ki;
-	struct inode *inode;
 	unsigned int old_nofs;
 	ssize_t ret;
 	size_t len = iov_iter_count(iter);
@@ -322,19 +319,12 @@  int __cachefiles_write(struct cachefiles_object *object,
 		ki->iocb.ki_complete = cachefiles_write_complete;
 	atomic_long_add(ki->b_writing, &cache->b_writing);
 
-	/* Open-code file_start_write here to grab freeze protection, which
-	 * will be released by another thread in aio_complete_rw().  Fool
-	 * lockdep by telling it the lock got released so that it doesn't
-	 * complain about the held lock when we return to userspace.
-	 */
-	inode = file_inode(file);
-	__sb_start_write(inode->i_sb, SB_FREEZE_WRITE);
-	__sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
+	kiocb_start_write(ki);
 
 	get_file(ki->iocb.ki_filp);
 	cachefiles_grab_object(object, cachefiles_obj_get_ioreq);
 
-	trace_cachefiles_write(object, inode, ki->iocb.ki_pos, len);
+	trace_cachefiles_write(object, file_inode(file), ki->iocb.ki_pos, len);
 	old_nofs = memalloc_nofs_save();
 	ret = cachefiles_inject_write_error();
 	if (ret == 0)
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 21245b00722a..c756edb9bd4c 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -290,10 +290,7 @@  static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
 	if (iocb->ki_flags & IOCB_WRITE) {
 		struct inode *inode = file_inode(orig_iocb->ki_filp);
 
-		/* Actually acquired in ovl_write_iter() */
-		__sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb,
-				      SB_FREEZE_WRITE);
-		file_end_write(iocb->ki_filp);
+		kiocb_end_write(iocb);
 		ovl_copyattr(inode);
 	}
 
@@ -362,6 +359,9 @@  static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 	return ret;
 }
 
+/* IOCB flags that may be propagated to real file io */
+#define OVL_IOCB_MASK ~(IOCB_WRITE_STARTED)
+
 static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
@@ -369,7 +369,7 @@  static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 	struct fd real;
 	const struct cred *old_cred;
 	ssize_t ret;
-	int ifl = iocb->ki_flags;
+	int ifl = iocb->ki_flags & OVL_IOCB_MASK;
 
 	if (!iov_iter_count(iter))
 		return 0;
@@ -409,10 +409,6 @@  static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 		if (!aio_req)
 			goto out;
 
-		file_start_write(real.file);
-		/* Pacify lockdep, same trick as done in aio_write() */
-		__sb_writers_release(file_inode(real.file)->i_sb,
-				     SB_FREEZE_WRITE);
 		aio_req->fd = real;
 		real.flags = 0;
 		aio_req->orig_iocb = iocb;
@@ -420,6 +416,7 @@  static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 		aio_req->iocb.ki_flags = ifl;
 		aio_req->iocb.ki_complete = ovl_aio_rw_complete;
 		refcount_set(&aio_req->ref, 2);
+		kiocb_start_write(&aio_req->iocb);
 		ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
 		ovl_aio_put(aio_req);
 		if (ret != -EIOCBQUEUED)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b2adee67f9b2..8e5d410a1be5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -338,6 +338,8 @@  enum rw_hint {
 #define IOCB_NOIO		(1 << 20)
 /* can use bio alloc cache */
 #define IOCB_ALLOC_CACHE	(1 << 21)
+/* file_start_write() was called */
+#define IOCB_WRITE_STARTED	(1 << 22)
 
 /* for use in trace events */
 #define TRACE_IOCB_STRINGS \
@@ -351,7 +353,8 @@  enum rw_hint {
 	{ IOCB_WRITE,		"WRITE" }, \
 	{ IOCB_WAITQ,		"WAITQ" }, \
 	{ IOCB_NOIO,		"NOIO" }, \
-	{ IOCB_ALLOC_CACHE,	"ALLOC_CACHE" }
+	{ IOCB_ALLOC_CACHE,	"ALLOC_CACHE" }, \
+	{ IOCB_WRITE_STARTED,	"WRITE_STARTED" }
 
 struct kiocb {
 	struct file		*ki_filp;
@@ -2545,6 +2548,13 @@  static inline bool inode_wrong_type(const struct inode *inode, umode_t mode)
 	return (inode->i_mode ^ mode) & S_IFMT;
 }
 
+/**
+ * file_start_write - get write access to a superblock for regular file io
+ * @file: the file we want to write to
+ *
+ * This is a variant of sb_start_write() which is a noop on non-regualr file.
+ * Should be matched with a call to file_end_write().
+ */
 static inline void file_start_write(struct file *file)
 {
 	if (!S_ISREG(file_inode(file)->i_mode))
@@ -2559,11 +2569,59 @@  static inline bool file_start_write_trylock(struct file *file)
 	return sb_start_write_trylock(file_inode(file)->i_sb);
 }
 
+/**
+ * file_end_write - drop write access to a superblock of a regular file
+ * @file: the file we wrote to
+ *
+ * Should be matched with a call to file_start_write().
+ */
 static inline void file_end_write(struct file *file)
 {
 	if (!S_ISREG(file_inode(file)->i_mode))
 		return;
-	__sb_end_write(file_inode(file)->i_sb, SB_FREEZE_WRITE);
+	sb_end_write(file_inode(file)->i_sb);
+}
+
+/**
+ * kiocb_start_write - get write access to a superblock for async file io
+ * @iocb: the io context we want to submit the write with
+ *
+ * This is a variant of file_start_write() for async io submission.
+ * Should be matched with a call to kiocb_end_write().
+ */
+static inline void kiocb_start_write(struct kiocb *iocb)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	if (WARN_ON_ONCE(iocb->ki_flags & IOCB_WRITE_STARTED))
+		return;
+	sb_start_write(inode->i_sb);
+	/*
+	 * Fool lockdep by telling it the lock got released so that it
+	 * doesn't complain about the held lock when we return to userspace.
+	 */
+	__sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
+	iocb->ki_flags |= IOCB_WRITE_STARTED;
+}
+
+/**
+ * kiocb_end_write - drop write access to a superblock after async file io
+ * @iocb: the io context we sumbitted the write with
+ *
+ * Should be matched with a call to kiocb_start_write().
+ */
+static inline void kiocb_end_write(struct kiocb *iocb)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	if (!(iocb->ki_flags & IOCB_WRITE_STARTED))
+		return;
+	/*
+	 * Tell lockdep we inherited freeze protection from submission thread.
+	 */
+	__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
+	sb_end_write(inode->i_sb);
+	iocb->ki_flags &= ~IOCB_WRITE_STARTED;
 }
 
 /*
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 1bce2208b65c..362d48493096 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -220,20 +220,6 @@  static bool io_rw_should_reissue(struct io_kiocb *req)
 }
 #endif
 
-static void kiocb_end_write(struct io_kiocb *req)
-{
-	/*
-	 * Tell lockdep we inherited freeze protection from submission
-	 * thread.
-	 */
-	if (req->flags & REQ_F_ISREG) {
-		struct super_block *sb = file_inode(req->file)->i_sb;
-
-		__sb_writers_acquired(sb, SB_FREEZE_WRITE);
-		sb_end_write(sb);
-	}
-}
-
 /*
  * Trigger the notifications after having done some IO, and finish the write
  * accounting, if any.
@@ -243,7 +229,7 @@  static void io_req_io_end(struct io_kiocb *req)
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 
 	if (rw->kiocb.ki_flags & IOCB_WRITE) {
-		kiocb_end_write(req);
+		kiocb_end_write(&rw->kiocb);
 		fsnotify_modify(req->file);
 	} else {
 		fsnotify_access(req->file);
@@ -313,7 +299,7 @@  static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
 	struct io_kiocb *req = cmd_to_io_kiocb(rw);
 
 	if (kiocb->ki_flags & IOCB_WRITE)
-		kiocb_end_write(req);
+		kiocb_end_write(kiocb);
 	if (unlikely(res != req->cqe.res)) {
 		if (res == -EAGAIN && io_rw_should_reissue(req)) {
 			req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
@@ -902,18 +888,8 @@  int io_write(struct io_kiocb *req, unsigned int issue_flags)
 		return ret;
 	}
 
-	/*
-	 * Open-code file_start_write here to grab freeze protection,
-	 * which will be released by another thread in
-	 * io_complete_rw().  Fool lockdep by telling it the lock got
-	 * released so that it doesn't complain about the held lock when
-	 * we return to userspace.
-	 */
-	if (req->flags & REQ_F_ISREG) {
-		sb_start_write(file_inode(req->file)->i_sb);
-		__sb_writers_release(file_inode(req->file)->i_sb,
-					SB_FREEZE_WRITE);
-	}
+	if (req->flags & REQ_F_ISREG)
+		kiocb_start_write(kiocb);
 	kiocb->ki_flags |= IOCB_WRITE;
 
 	if (likely(req->file->f_op->write_iter))
@@ -961,7 +937,7 @@  int io_write(struct io_kiocb *req, unsigned int issue_flags)
 				io->bytes_done += ret2;
 
 			if (kiocb->ki_flags & IOCB_WRITE)
-				kiocb_end_write(req);
+				kiocb_end_write(kiocb);
 			return ret ? ret : -EAGAIN;
 		}
 done:
@@ -972,7 +948,7 @@  int io_write(struct io_kiocb *req, unsigned int issue_flags)
 		ret = io_setup_async_rw(req, iovec, s, false);
 		if (!ret) {
 			if (kiocb->ki_flags & IOCB_WRITE)
-				kiocb_end_write(req);
+				kiocb_end_write(kiocb);
 			return -EAGAIN;
 		}
 		return ret;