diff mbox series

fs: allow calling kiocb_end_write() unmatched with kiocb_start_write()

Message ID 20231121132551.2337431-1-amir73il@gmail.com (mailing list archive)
State New
Headers show
Series fs: allow calling kiocb_end_write() unmatched with kiocb_start_write() | expand

Commit Message

Amir Goldstein Nov. 21, 2023, 1:25 p.m. UTC
We want to move kiocb_start_write() into vfs_iocb_iter_write(), after
the permission hook, but leave kiocb_end_write() in the write completion
handler of the callers of vfs_iocb_iter_write().

After this change, there will be no way of knowing, in the completion
handler, whether the write failed before or after calling kiocb_start_write().

Add a flag IOCB_WRITE_STARTED, which is set and cleared internally by
kiocb_{start,end}_write(), so that kiocb_end_write() could be called for
cleanup of async write, whether it was successful or whether it failed
before or after calling kiocb_start_write().

This flag must not be copied by stacked filesystems (e.g. overlayfs)
that clone the iocb to another iocb for io request on a backing file.

Link: https://lore.kernel.org/r/CAOQ4uxihfJJRxxUhAmOwtD97Lg8PL8RgXw88rH1UfEeP8AtP+w@mail.gmail.com/
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---

Jens,

The kiocb_{start,end}_write() helpers were originally taken out of
an early version of the permission hooks cleanup patches [1].

This early version of the helpers had the IOCB_WRITE_STARTED flag, but
when I posted the helpers independently from the cleanup series, you had
correctly pointed out [2] that IOCB_WRITE_STARTED is not needed for any
of the existing callers of kiocb_{start,end}_write(), so I removed it for
the final version of the helpers that got merged.

When coming back to the permission hook cleanup, I see that the
IOCB_WRITE_STARTED flag is needed to allow the new semantics of calling
kiocb_start_write() inside vfs_iocb_iter_write() and kiocb_end_write()
in completion callback.

I realize these semantics are not great, but the alternative of moving
the permission hook from vfs_iocb_iter_write() into all the callers is
worse IMO.

Can you accept the solution with the IOCB_WRITE_STARTED state flag?
Or do you have a better idea for cleaner iocb issue semantics that would
allow calling the permission hook with freeze protection held?

Thanks,
Amir.

[1] https://lore.kernel.org/linux-fsdevel/20231114153321.1716028-1-amir73il@gmail.com/
[2] https://lore.kernel.org/linux-fsdevel/e6948836-1d9d-4219-9a21-a2ab442a9a34@kernel.dk/

 fs/overlayfs/file.c |  2 +-
 include/linux/fs.h  | 18 ++++++++++++++++--
 2 files changed, 17 insertions(+), 3 deletions(-)

Comments

Josef Bacik Nov. 21, 2023, 9 p.m. UTC | #1
On Tue, Nov 21, 2023 at 03:25:51PM +0200, Amir Goldstein wrote:
> We want to move kiocb_start_write() into vfs_iocb_iter_write(), after
> the permission hook, but leave kiocb_end_write() in the write completion
> handler of the callers of vfs_iocb_iter_write().
> 
> After this change, there will be no way of knowing in completion handler,
> if write has failed before or after calling kiocb_start_write().
> 
> Add a flag IOCB_WRITE_STARTED, which is set and cleared internally by
> kiocb_{start,end}_write(), so that kiocb_end_write() could be called for
> cleanup of async write, whether it was successful or whether it failed
> before or after calling kiocb_start_write().
> 
> This flag must not be copied by stacked filesystems (e.g. overlayfs)
> that clone the iocb to another iocb for io request on a backing file.
> 
> Link: https://lore.kernel.org/r/CAOQ4uxihfJJRxxUhAmOwtD97Lg8PL8RgXw88rH1UfEeP8AtP+w@mail.gmail.com/
> Signed-off-by: Amir Goldstein <amir73il@gmail.com>

This is only a problem for cachefiles and overlayfs, and really just for
cachefiles because of the error handling thing.

What if instead we made vfs_iocb_iter_write() call kiocb_end_write() in the ret
!= EIOCBQUEUED case, that way it is in charge of the start and the end, and the
only case where the file system has to worry about is the actual io completion
path when the kiocb is completed.

The result is something like what I've pasted below, completely uncompiled and
untested.  Thanks,

Josef

diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 009d23cd435b..5857241c5918 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -259,7 +259,8 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret)
 
 	_enter("%ld", ret);
 
-	kiocb_end_write(iocb);
+	if (ki->was_async)
+		kiocb_end_write(iocb);
 
 	if (ret < 0)
 		trace_cachefiles_io_error(object, inode, ret,
@@ -319,8 +320,6 @@ int __cachefiles_write(struct cachefiles_object *object,
 		ki->iocb.ki_complete = cachefiles_write_complete;
 	atomic_long_add(ki->b_writing, &cache->b_writing);
 
-	kiocb_start_write(&ki->iocb);
-
 	get_file(ki->iocb.ki_filp);
 	cachefiles_grab_object(object, cachefiles_obj_get_ioreq);
 
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 131621daeb13..19d2682d28f9 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -295,11 +295,6 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
 	struct kiocb *iocb = &aio_req->iocb;
 	struct kiocb *orig_iocb = aio_req->orig_iocb;
 
-	if (iocb->ki_flags & IOCB_WRITE) {
-		kiocb_end_write(iocb);
-		ovl_file_modified(orig_iocb->ki_filp);
-	}
-
 	orig_iocb->ki_pos = iocb->ki_pos;
 	ovl_aio_put(aio_req);
 }
@@ -310,6 +305,11 @@ static void ovl_aio_rw_complete(struct kiocb *iocb, long res)
 						   struct ovl_aio_req, iocb);
 	struct kiocb *orig_iocb = aio_req->orig_iocb;
 
+	if (iocb->ki_flags & IOCB_WRITE) {
+		kiocb_end_write(iocb);
+		ovl_file_modified(orig_iocb->ki_filp);
+	}
+
 	ovl_aio_cleanup_handler(aio_req);
 	orig_iocb->ki_complete(orig_iocb, res);
 }
@@ -458,7 +458,6 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 		aio_req->iocb.ki_flags = ifl;
 		aio_req->iocb.ki_complete = ovl_aio_queue_completion;
 		refcount_set(&aio_req->ref, 2);
-		kiocb_start_write(&aio_req->iocb);
 		ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
 		ovl_aio_put(aio_req);
 		if (ret != -EIOCBQUEUED)
diff --git a/fs/read_write.c b/fs/read_write.c
index 4771701c896b..6ec3abed43dc 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -821,7 +821,10 @@ ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
 	if (ret < 0)
 		return ret;
 
+	kiocb_start_write(iocb);
 	ret = call_read_iter(file, iocb, iter);
+	if (ret != -EIOCBQUEUED)
+		kiocb_end_write(iocb);
 out:
 	if (ret >= 0)
 		fsnotify_access(file);
Dave Chinner Nov. 21, 2023, 9:30 p.m. UTC | #2
On Tue, Nov 21, 2023 at 04:00:32PM -0500, Josef Bacik wrote:
> On Tue, Nov 21, 2023 at 03:25:51PM +0200, Amir Goldstein wrote:
> > We want to move kiocb_start_write() into vfs_iocb_iter_write(), after
> > the permission hook, but leave kiocb_end_write() in the write completion
> > handler of the callers of vfs_iocb_iter_write().
> > 
> > After this change, there will be no way of knowing in completion handler,
> > if write has failed before or after calling kiocb_start_write().
> > 
> > Add a flag IOCB_WRITE_STARTED, which is set and cleared internally by
> > kiocb_{start,end}_write(), so that kiocb_end_write() could be called for
> > cleanup of async write, whether it was successful or whether it failed
> > before or after calling kiocb_start_write().
> > 
> > This flag must not be copied by stacked filesystems (e.g. overlayfs)
> > that clone the iocb to another iocb for io request on a backing file.
> > 
> > Link: https://lore.kernel.org/r/CAOQ4uxihfJJRxxUhAmOwtD97Lg8PL8RgXw88rH1UfEeP8AtP+w@mail.gmail.com/
> > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> 
> This is only a problem for cachefiles and overlayfs, and really just for
> cachefiles because of the error handling thing.
> 
> What if instead we made vfs_iocb_iter_write() call kiocb_end_write() in the ret
> != EIOCBQUEUED case, that way it is in charge of the start and the end, and the
> only case where the file system has to worry about is the actual io completion
> path when the kiocb is completed.
> 
> The result is something like what I've pasted below, completely uncompiled and
> untested.  Thanks,

I like this a lot better than an internal flag that allows
unbalanced start/end calls to proliferate.  I find it much easier to
read the code, determine the correct cleanup is being done and
maintain the code in future when calls need to be properly
paired....

-Dave.
Amir Goldstein Nov. 22, 2023, 5:38 a.m. UTC | #3
On Tue, Nov 21, 2023 at 11:30 PM Dave Chinner <david@fromorbit.com> wrote:
>
> On Tue, Nov 21, 2023 at 04:00:32PM -0500, Josef Bacik wrote:
> > On Tue, Nov 21, 2023 at 03:25:51PM +0200, Amir Goldstein wrote:
> > > We want to move kiocb_start_write() into vfs_iocb_iter_write(), after
> > > the permission hook, but leave kiocb_end_write() in the write completion
> > > handler of the callers of vfs_iocb_iter_write().
> > >
> > > After this change, there will be no way of knowing in completion handler,
> > > if write has failed before or after calling kiocb_start_write().
> > >
> > > Add a flag IOCB_WRITE_STARTED, which is set and cleared internally by
> > > kiocb_{start,end}_write(), so that kiocb_end_write() could be called for
> > > cleanup of async write, whether it was successful or whether it failed
> > > before or after calling kiocb_start_write().
> > >
> > > This flag must not be copied by stacked filesystems (e.g. overlayfs)
> > > that clone the iocb to another iocb for io request on a backing file.
> > >
> > > Link: https://lore.kernel.org/r/CAOQ4uxihfJJRxxUhAmOwtD97Lg8PL8RgXw88rH1UfEeP8AtP+w@mail.gmail.com/
> > > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> >
> > This is only a problem for cachefiles and overlayfs, and really just for
> > cachefiles because of the error handling thing.
> >
> > What if instead we made vfs_iocb_iter_write() call kiocb_end_write() in the ret
> > != EIOCBQUEUED case, that way it is in charge of the start and the end, and the
> > only case where the file system has to worry about is the actual io completion
> > path when the kiocb is completed.
> >
> > The result is something like what I've pasted below, completely uncompiled and
> > untested.  Thanks,
>
> I like this a lot better than an internal flag that allows
> unbalanced start/end calls to proliferate.  I find it much easier to
> read the code, determine the correct cleanup is being done and
> maintain the code in future when calls need to be properly
> paired....

Yes! I like it too.

Thank you Josef!
I'll test that and post v2 with your RVBs.

Amir.
Christoph Hellwig Nov. 22, 2023, 6:50 a.m. UTC | #4
On Wed, Nov 22, 2023 at 08:30:01AM +1100, Dave Chinner wrote:
> I like this a lot better than an internal flag that allows
> unbalanced start/end calls to proliferate.  I find it much easier to
> read the code, determine the correct cleanup is being done and
> maintain the code in future when calls need to be properly
> paired....

Agreed.
Christian Brauner Nov. 22, 2023, 10:20 a.m. UTC | #5
On Tue, Nov 21, 2023 at 10:50:29PM -0800, Christoph Hellwig wrote:
> On Wed, Nov 22, 2023 at 08:30:01AM +1100, Dave Chinner wrote:
> > I like this a lot better than an internal flag that allows
> > unbalanced start/end calls to proliferate.  I find it much easier to
> > read the code, determine the correct cleanup is being done and
> > maintain the code in future when calls need to be properly
> > paired....
> 
> Agreed.

Agreed.
diff mbox series

Patch

diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 131621daeb13..e4baa4ea5c95 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -455,7 +455,7 @@  static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 
 		aio_req->orig_iocb = iocb;
 		kiocb_clone(&aio_req->iocb, iocb, get_file(real.file));
-		aio_req->iocb.ki_flags = ifl;
+		aio_req->iocb.ki_flags &= ifl;
 		aio_req->iocb.ki_complete = ovl_aio_queue_completion;
 		refcount_set(&aio_req->ref, 2);
 		kiocb_start_write(&aio_req->iocb);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98b7a7a8c42e..64414e146e1e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -352,6 +352,8 @@  enum rw_hint {
  * unrelated IO (like cache flushing, new IO generation, etc).
  */
 #define IOCB_DIO_CALLER_COMP	(1 << 22)
+/* file_start_write() was called */
+#define IOCB_WRITE_STARTED	(1 << 23)
 
 /* for use in trace events */
 #define TRACE_IOCB_STRINGS \
@@ -366,7 +368,8 @@  enum rw_hint {
 	{ IOCB_WAITQ,		"WAITQ" }, \
 	{ IOCB_NOIO,		"NOIO" }, \
 	{ IOCB_ALLOC_CACHE,	"ALLOC_CACHE" }, \
-	{ IOCB_DIO_CALLER_COMP,	"CALLER_COMP" }
+	{ IOCB_DIO_CALLER_COMP,	"CALLER_COMP" }, \
+	{ IOCB_WRITE_STARTED,	"WRITE_STARTED" }
 
 struct kiocb {
 	struct file		*ki_filp;
@@ -2183,12 +2186,15 @@  static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
 	};
 }
 
+/* IOCB flags associated with ki_filp state must not be cloned */
+#define IOCB_CLONE_FLAGS_MASK	(~IOCB_WRITE_STARTED)
+
 static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
 			       struct file *filp)
 {
 	*kiocb = (struct kiocb) {
 		.ki_filp = filp,
-		.ki_flags = kiocb_src->ki_flags,
+		.ki_flags = kiocb_src->ki_flags & IOCB_CLONE_FLAGS_MASK,
 		.ki_ioprio = kiocb_src->ki_ioprio,
 		.ki_pos = kiocb_src->ki_pos,
 	};
@@ -2744,12 +2750,16 @@  static inline void kiocb_start_write(struct kiocb *iocb)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
 
+	if (WARN_ON_ONCE(iocb->ki_flags & IOCB_WRITE_STARTED))
+		return;
+
 	sb_start_write(inode->i_sb);
 	/*
 	 * Fool lockdep by telling it the lock got released so that it
 	 * doesn't complain about the held lock when we return to userspace.
 	 */
 	__sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
+	iocb->ki_flags |= IOCB_WRITE_STARTED;
 }
 
 /**
@@ -2762,11 +2772,15 @@  static inline void kiocb_end_write(struct kiocb *iocb)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
 
+	if (!(iocb->ki_flags & IOCB_WRITE_STARTED))
+		return;
+
 	/*
 	 * Tell lockdep we inherited freeze protection from submission thread.
 	 */
 	__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
 	sb_end_write(inode->i_sb);
+	iocb->ki_flags &= ~IOCB_WRITE_STARTED;
 }
 
 /*