Message ID | 20240910043949.3481298-7-hch@lst.de (mailing list archive) |
---|---|
State | Not Applicable, archived |
Headers | show |
Series | [01/12] iomap: handle a post-direct I/O invalidate race in iomap_write_delalloc_release | expand |
On Tue, Sep 10, 2024 at 07:39:08AM +0300, Christoph Hellwig wrote: > Split a helper from xfs_file_write_checks that just deal with the > post-EOF zeroing to keep the code readable. > > Signed-off-by: Christoph Hellwig <hch@lst.de> > --- > fs/xfs/xfs_file.c | 133 ++++++++++++++++++++++++++-------------------- > 1 file changed, 75 insertions(+), 58 deletions(-) > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index 0d258c21b9897f..a30fda1985e6af 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -347,10 +347,70 @@ xfs_file_splice_read( > return ret; > } > > +static ssize_t > +xfs_file_write_zero_eof( > + struct kiocb *iocb, > + struct iov_iter *from, > + unsigned int *iolock, > + size_t count, > + bool *drained_dio) > +{ > + struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); > + loff_t isize; > + > + /* > + * We need to serialise against EOF updates that occur in IO completions > + * here. We want to make sure that nobody is changing the size while > + * we do this check until we have placed an IO barrier (i.e. hold > + * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The > + * spinlock effectively forms a memory barrier once we have > + * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and > + * hence be able to correctly determine if we need to run zeroing. > + */ > + spin_lock(&ip->i_flags_lock); > + isize = i_size_read(VFS_I(ip)); > + if (iocb->ki_pos <= isize) { > + spin_unlock(&ip->i_flags_lock); > + return 0; > + } > + spin_unlock(&ip->i_flags_lock); > + > + if (iocb->ki_flags & IOCB_NOWAIT) > + return -EAGAIN; > + > + if (!*drained_dio) { > + /* > + * If zeroing is needed and we are currently holding the iolock > + * shared, we need to update it to exclusive which implies > + * having to redo all checks before. 
> + */ > + if (*iolock == XFS_IOLOCK_SHARED) { > + xfs_iunlock(ip, *iolock); > + *iolock = XFS_IOLOCK_EXCL; > + xfs_ilock(ip, *iolock); > + iov_iter_reexpand(from, count); > + } > + > + /* > + * We now have an IO submission barrier in place, but AIO can do > + * EOF updates during IO completion and hence we now need to > + * wait for all of them to drain. Non-AIO DIO will have drained > + * before we are given the XFS_IOLOCK_EXCL, and so for most > + * cases this wait is a no-op. > + */ > + inode_dio_wait(VFS_I(ip)); > + *drained_dio = true; > + return 1; I gotta say, I'm not a big fan of the "return 1 to loop again" behavior. Can you add a comment at the top stating that this is a possible return value and why it gets returned? --D > + } > + > + trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); > + return xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); > +} > + > /* > * Common pre-write limit and setup checks. > * > - * Called with the iolocked held either shared and exclusive according to > + * Called with the iolock held either shared and exclusive according to > * @iolock, and returns with it held. Might upgrade the iolock to exclusive > * if called for a direct write beyond i_size. > */ > @@ -360,13 +420,10 @@ xfs_file_write_checks( > struct iov_iter *from, > unsigned int *iolock) > { > - struct file *file = iocb->ki_filp; > - struct inode *inode = file->f_mapping->host; > - struct xfs_inode *ip = XFS_I(inode); > - ssize_t error = 0; > + struct inode *inode = iocb->ki_filp->f_mapping->host; > size_t count = iov_iter_count(from); > bool drained_dio = false; > - loff_t isize; > + ssize_t error; > > restart: > error = generic_write_checks(iocb, from); > @@ -389,7 +446,7 @@ xfs_file_write_checks( > * exclusively. 
> */ > if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { > - xfs_iunlock(ip, *iolock); > + xfs_iunlock(XFS_I(inode), *iolock); > *iolock = XFS_IOLOCK_EXCL; > error = xfs_ilock_iocb(iocb, *iolock); > if (error) { > @@ -400,64 +457,24 @@ xfs_file_write_checks( > } > > /* > - * If the offset is beyond the size of the file, we need to zero any > + * If the offset is beyond the size of the file, we need to zero all > * blocks that fall between the existing EOF and the start of this > - * write. If zeroing is needed and we are currently holding the iolock > - * shared, we need to update it to exclusive which implies having to > - * redo all checks before. > - * > - * We need to serialise against EOF updates that occur in IO completions > - * here. We want to make sure that nobody is changing the size while we > - * do this check until we have placed an IO barrier (i.e. hold the > - * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The > - * spinlock effectively forms a memory barrier once we have the > - * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and > - * hence be able to correctly determine if we need to run zeroing. > + * write. > * > - * We can do an unlocked check here safely as IO completion can only > - * extend EOF. Truncate is locked out at this point, so the EOF can > - * not move backwards, only forwards. Hence we only need to take the > - * slow path and spin locks when we are at or beyond the current EOF. > + * We can do an unlocked check for i_size here safely as I/O completion > + * can only extend EOF. Truncate is locked out at this point, so the > + * EOF can not move backwards, only forwards. Hence we only need to take > + * the slow path when we are at or beyond the current EOF. 
> */ > - if (iocb->ki_pos <= i_size_read(inode)) > - goto out; > - > - spin_lock(&ip->i_flags_lock); > - isize = i_size_read(inode); > - if (iocb->ki_pos > isize) { > - spin_unlock(&ip->i_flags_lock); > - > - if (iocb->ki_flags & IOCB_NOWAIT) > - return -EAGAIN; > - > - if (!drained_dio) { > - if (*iolock == XFS_IOLOCK_SHARED) { > - xfs_iunlock(ip, *iolock); > - *iolock = XFS_IOLOCK_EXCL; > - xfs_ilock(ip, *iolock); > - iov_iter_reexpand(from, count); > - } > - /* > - * We now have an IO submission barrier in place, but > - * AIO can do EOF updates during IO completion and hence > - * we now need to wait for all of them to drain. Non-AIO > - * DIO will have drained before we are given the > - * XFS_IOLOCK_EXCL, and so for most cases this wait is a > - * no-op. > - */ > - inode_dio_wait(inode); > - drained_dio = true; > + if (iocb->ki_pos > i_size_read(inode)) { > + error = xfs_file_write_zero_eof(iocb, from, iolock, count, > + &drained_dio); > + if (error == 1) > goto restart; > - } > - > - trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); > - error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); > if (error) > return error; > - } else > - spin_unlock(&ip->i_flags_lock); > + } > > -out: > return kiocb_modified(iocb); > } > > -- > 2.45.2 > >
On Tue, Sep 17, 2024 at 02:14:19PM -0700, Darrick J. Wong wrote: > I gotta say, I'm not a big fan of the "return 1 to loop again" behavior. > Can you add a comment at the top stating that this is a possible return > value and why it gets returned? Sure. If you have a better idea I'm all ears, too.
On Wed, Sep 18, 2024 at 07:09:36AM +0200, Christoph Hellwig wrote: > On Tue, Sep 17, 2024 at 02:14:19PM -0700, Darrick J. Wong wrote: > > I gotta say, I'm not a big fan of the "return 1 to loop again" behavior. > > Can you add a comment at the top stating that this is a possible return > > value and why it gets returned? > > Sure. If you have a better idea I'm all ears, too. Sadly, I don't. The closest I can think of is that iterator functions return 1 for "here's an object", 0 for "no more objects", or negative errno. But this isn't iterating objects, it's (at best) "iterating" the "not yet ready to do zeroing" states, and that's a stretch. --D
On Wed, Sep 18, 2024 at 07:09:36AM +0200, Christoph Hellwig wrote: > On Tue, Sep 17, 2024 at 02:14:19PM -0700, Darrick J. Wong wrote: > > I gotta say, I'm not a big fan of the "return 1 to loop again" behavior. > > Can you add a comment at the top stating that this is a possible return > > value and why it gets returned? > > Sure. If you have a better idea I'm all ears, too. I would have thought a -EBUSY error would have been appropriate. i.e. there was an extending write in progress (busy doing IO) so we couldn't perform the zeroing operation and hence the write needs to be restarted now the IO has been drained... -Dave.
On Fri, Sep 20, 2024 at 10:25:54AM +1000, Dave Chinner wrote: > > Sure. If you have a better idea I'm all ears, too. > > I would have thought a -EBUSY error would have been appropriate. > i.e. there was an extending write in progress (busy doing IO) so we > couldn't perform the zeroing operation and hence the write needs to > be restarted now the IO has been drained... I can't say that I'm a huge fan of overloading errno values when there is quite a few call that could return basically arbitrary errors in the chain. See the "fix a DEBUG-only assert failure in xfs/538" series for when this kind of errno overloading causes problems later on in unexpected ways.
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 0d258c21b9897f..a30fda1985e6af 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -347,10 +347,70 @@ xfs_file_splice_read( return ret; } +static ssize_t +xfs_file_write_zero_eof( + struct kiocb *iocb, + struct iov_iter *from, + unsigned int *iolock, + size_t count, + bool *drained_dio) +{ + struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); + loff_t isize; + + /* + * We need to serialise against EOF updates that occur in IO completions + * here. We want to make sure that nobody is changing the size while + * we do this check until we have placed an IO barrier (i.e. hold + * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The + * spinlock effectively forms a memory barrier once we have + * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and + * hence be able to correctly determine if we need to run zeroing. + */ + spin_lock(&ip->i_flags_lock); + isize = i_size_read(VFS_I(ip)); + if (iocb->ki_pos <= isize) { + spin_unlock(&ip->i_flags_lock); + return 0; + } + spin_unlock(&ip->i_flags_lock); + + if (iocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + + if (!*drained_dio) { + /* + * If zeroing is needed and we are currently holding the iolock + * shared, we need to update it to exclusive which implies + * having to redo all checks before. + */ + if (*iolock == XFS_IOLOCK_SHARED) { + xfs_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_ilock(ip, *iolock); + iov_iter_reexpand(from, count); + } + + /* + * We now have an IO submission barrier in place, but AIO can do + * EOF updates during IO completion and hence we now need to + * wait for all of them to drain. Non-AIO DIO will have drained + * before we are given the XFS_IOLOCK_EXCL, and so for most + * cases this wait is a no-op. 
+ */ + inode_dio_wait(VFS_I(ip)); + *drained_dio = true; + return 1; + } + + trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); + return xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); +} + /* * Common pre-write limit and setup checks. * - * Called with the iolocked held either shared and exclusive according to + * Called with the iolock held either shared and exclusive according to * @iolock, and returns with it held. Might upgrade the iolock to exclusive * if called for a direct write beyond i_size. */ @@ -360,13 +420,10 @@ xfs_file_write_checks( struct iov_iter *from, unsigned int *iolock) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct xfs_inode *ip = XFS_I(inode); - ssize_t error = 0; + struct inode *inode = iocb->ki_filp->f_mapping->host; size_t count = iov_iter_count(from); bool drained_dio = false; - loff_t isize; + ssize_t error; restart: error = generic_write_checks(iocb, from); @@ -389,7 +446,7 @@ xfs_file_write_checks( * exclusively. */ if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { - xfs_iunlock(ip, *iolock); + xfs_iunlock(XFS_I(inode), *iolock); *iolock = XFS_IOLOCK_EXCL; error = xfs_ilock_iocb(iocb, *iolock); if (error) { @@ -400,64 +457,24 @@ xfs_file_write_checks( } /* - * If the offset is beyond the size of the file, we need to zero any + * If the offset is beyond the size of the file, we need to zero all * blocks that fall between the existing EOF and the start of this - * write. If zeroing is needed and we are currently holding the iolock - * shared, we need to update it to exclusive which implies having to - * redo all checks before. - * - * We need to serialise against EOF updates that occur in IO completions - * here. We want to make sure that nobody is changing the size while we - * do this check until we have placed an IO barrier (i.e. hold the - * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. 
The - * spinlock effectively forms a memory barrier once we have the - * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and - * hence be able to correctly determine if we need to run zeroing. + * write. * - * We can do an unlocked check here safely as IO completion can only - * extend EOF. Truncate is locked out at this point, so the EOF can - * not move backwards, only forwards. Hence we only need to take the - * slow path and spin locks when we are at or beyond the current EOF. + * We can do an unlocked check for i_size here safely as I/O completion + * can only extend EOF. Truncate is locked out at this point, so the + * EOF can not move backwards, only forwards. Hence we only need to take + * the slow path when we are at or beyond the current EOF. */ - if (iocb->ki_pos <= i_size_read(inode)) - goto out; - - spin_lock(&ip->i_flags_lock); - isize = i_size_read(inode); - if (iocb->ki_pos > isize) { - spin_unlock(&ip->i_flags_lock); - - if (iocb->ki_flags & IOCB_NOWAIT) - return -EAGAIN; - - if (!drained_dio) { - if (*iolock == XFS_IOLOCK_SHARED) { - xfs_iunlock(ip, *iolock); - *iolock = XFS_IOLOCK_EXCL; - xfs_ilock(ip, *iolock); - iov_iter_reexpand(from, count); - } - /* - * We now have an IO submission barrier in place, but - * AIO can do EOF updates during IO completion and hence - * we now need to wait for all of them to drain. Non-AIO - * DIO will have drained before we are given the - * XFS_IOLOCK_EXCL, and so for most cases this wait is a - * no-op. - */ - inode_dio_wait(inode); - drained_dio = true; + if (iocb->ki_pos > i_size_read(inode)) { + error = xfs_file_write_zero_eof(iocb, from, iolock, count, + &drained_dio); + if (error == 1) goto restart; - } - - trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); - error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); if (error) return error; - } else - spin_unlock(&ip->i_flags_lock); + } -out: return kiocb_modified(iocb); }
Split a helper from xfs_file_write_checks that just deals with the post-EOF zeroing to keep the code readable. Signed-off-by: Christoph Hellwig <hch@lst.de> --- fs/xfs/xfs_file.c | 133 ++++++++++++++++++++++++++-------------------- 1 file changed, 75 insertions(+), 58 deletions(-)