Message ID | eae9d2125de1887f55186668937df7475b0a33f4.1678977084.git.ritesh.list@gmail.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | [RFCv1,WIP] ext2: Move direct-io to use iomap | expand |
On Thu, Mar 16, 2023 at 08:10:29PM +0530, Ritesh Harjani (IBM) wrote: > [DO NOT MERGE] [WORK-IN-PROGRESS] > > Hello Jan, > > This is an initial version of the patch set which I wanted to share > before today's call. This is still work in progress but atleast passes > the set of test cases which I had kept for dio testing (except 1 from my > list). > > Looks like there won't be much/any changes required from iomap side to > support ext2 moving to iomap apis. > > I will be doing some more testing specifically test generic/083 which is > occassionally failing in my testing. > Also once this is stabilized, I can do some performance testing too if you > feel so. Last I remembered we saw some performance regressions when ext4 > moved to iomap for dio. > > PS: Please ignore if there are some silly mistakes. As I said, I wanted > to get this out before today's discussion. :) > > Thanks for your help!! > > Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com> > --- > fs/ext2/ext2.h | 1 + > fs/ext2/file.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++ > fs/ext2/inode.c | 20 +-------- > 3 files changed, 117 insertions(+), 18 deletions(-) > > diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h > index cb78d7dcfb95..cb5e309fe040 100644 > --- a/fs/ext2/ext2.h > +++ b/fs/ext2/ext2.h > @@ -753,6 +753,7 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned); > extern struct inode *ext2_iget (struct super_block *, unsigned long); > extern int ext2_write_inode (struct inode *, struct writeback_control *); > extern void ext2_evict_inode(struct inode *); > +extern void ext2_write_failed(struct address_space *mapping, loff_t to); > extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); > extern int ext2_setattr (struct mnt_idmap *, struct dentry *, struct iattr *); > extern int ext2_getattr (struct mnt_idmap *, const struct path *, > diff --git a/fs/ext2/file.c b/fs/ext2/file.c > index 6b4bebe982ca..7a8561304559 100644 > --- a/fs/ext2/file.c > +++ b/fs/ext2/file.c > @@ -161,12 +161,123 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync) > return ret; > } > > +static ssize_t ext2_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) > +{ > + struct file *file = iocb->ki_filp; > + struct inode *inode = file->f_mapping->host; > + ssize_t ret; > + > + inode_lock_shared(inode); > + ret = iomap_dio_rw(iocb, to, &ext2_iomap_ops, NULL, 0, NULL, 0); > + inode_unlock_shared(inode); > + > + return ret; > +} > + > +static int ext2_dio_write_end_io(struct kiocb *iocb, ssize_t size, > + int error, unsigned int flags) > +{ > + loff_t pos = iocb->ki_pos; > + struct inode *inode = file_inode(iocb->ki_filp); > + > + if (error) > + return error; > + > + pos += size; > + if (pos > i_size_read(inode)) > + i_size_write(inode, pos); > + > + return 0; > +} > + > +static const struct iomap_dio_ops ext2_dio_write_ops = { > + .end_io = ext2_dio_write_end_io, > +}; > + > +static ssize_t ext2_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) > +{ > + struct file *file = iocb->ki_filp; > + struct inode *inode = file->f_mapping->host; > + ssize_t ret; > + unsigned int flags; > + unsigned long blocksize = inode->i_sb->s_blocksize; > + loff_t offset = iocb->ki_pos; > + loff_t count = iov_iter_count(from); > + > + > + inode_lock(inode); > + ret = generic_write_checks(iocb, from); > + if (ret <= 0) > + goto out_unlock; > + ret = file_remove_privs(file); > + if (ret) > + goto out_unlock; > + ret = file_update_time(file); > + if (ret) > + goto out_unlock; kiocb_modified() instead of calling file_remove_privs? > + > + /* > + * We pass IOMAP_DIO_NOSYNC because otherwise iomap_dio_rw() > + * calls for generic_write_sync in iomap_dio_complete(). > + * Since ext2_fsync nmust be called w/o inode lock, > + * hence we pass IOMAP_DIO_NOSYNC and handle generic_write_sync() > + * ourselves. > + */ > + flags = IOMAP_DIO_NOSYNC; > + > + /* use IOMAP_DIO_FORCE_WAIT for unaligned of extending writes */ > + if (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode) || > + (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(from), blocksize))) > + flags |= IOMAP_DIO_FORCE_WAIT; > + > + ret = iomap_dio_rw(iocb, from, &ext2_iomap_ops, &ext2_dio_write_ops, > + flags, NULL, 0); > + > + if (ret == -ENOTBLK) > + ret = 0; > + > + if (ret < 0 && ret != -EIOCBQUEUED) > + ext2_write_failed(inode->i_mapping, offset + count); > + > + /* handle case for partial write or fallback to buffered write */ > + if (ret >= 0 && iov_iter_count(from)) { > + loff_t pos, endbyte; > + ssize_t status; > + ssize_t ret2; > + > + pos = iocb->ki_pos; > + status = generic_perform_write(iocb, from); > + if (unlikely(status < 0)) { > + ret = status; > + goto out_unlock; > + } > + endbyte = pos + status - 1; > + ret2 = filemap_write_and_wait_range(inode->i_mapping, pos, > + endbyte); > + if (ret2 == 0) { > + iocb->ki_pos = endbyte + 1; > + ret += status; > + invalidate_mapping_pages(inode->i_mapping, > + pos >> PAGE_SHIFT, > + endbyte >> PAGE_SHIFT); > + } > + } (Why not fall back to the actual buffered write path?) Otherwise this looks like a reasonable first start. --D > +out_unlock: > + inode_unlock(inode); > + if (ret > 0) > + ret = generic_write_sync(iocb, ret); > + return ret; > +} > + > static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) > { > #ifdef CONFIG_FS_DAX > if (IS_DAX(iocb->ki_filp->f_mapping->host)) > return ext2_dax_read_iter(iocb, to); > #endif > + if (iocb->ki_flags & IOCB_DIRECT) > + return ext2_dio_read_iter(iocb, to); > + > return generic_file_read_iter(iocb, to); > } > > @@ -176,6 +287,9 @@ static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) > if (IS_DAX(iocb->ki_filp->f_mapping->host)) > return ext2_dax_write_iter(iocb, from); > #endif > + if (iocb->ki_flags & IOCB_DIRECT) > + return ext2_dio_write_iter(iocb, from); > + > return generic_file_write_iter(iocb, from); > } > > diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c > index 26f135e7ffce..7ff669d0b6d2 100644 > --- a/fs/ext2/inode.c > +++ b/fs/ext2/inode.c > @@ -56,7 +56,7 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode) > > static void ext2_truncate_blocks(struct inode *inode, loff_t offset); > > -static void ext2_write_failed(struct address_space *mapping, loff_t to) > +void ext2_write_failed(struct address_space *mapping, loff_t to) > { > struct inode *inode = mapping->host; > > @@ -908,22 +908,6 @@ static sector_t ext2_bmap(struct address_space *mapping, sector_t block) > return generic_block_bmap(mapping,block,ext2_get_block); > } > > -static ssize_t > -ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) > -{ > - struct file *file = iocb->ki_filp; > - struct address_space *mapping = file->f_mapping; > - struct inode *inode = mapping->host; > - size_t count = iov_iter_count(iter); > - loff_t offset = iocb->ki_pos; > - ssize_t ret; > - > - ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block); > - if (ret < 0 && iov_iter_rw(iter) == WRITE) > - ext2_write_failed(mapping, offset + count); > - return ret; > -} > - > static int > ext2_writepages(struct address_space *mapping, struct writeback_control *wbc) > { > @@ -946,7 +930,7 @@ const struct address_space_operations ext2_aops = { > .write_begin = ext2_write_begin, > .write_end = ext2_write_end, > .bmap = ext2_bmap, > - .direct_IO = ext2_direct_IO, > + .direct_IO = noop_direct_IO, > .writepages = ext2_writepages, > .migrate_folio = buffer_migrate_folio, > .is_partially_uptodate = block_is_partially_uptodate, > -- > 2.39.2 >
On Thu, Mar 16, 2023 at 08:10:29PM +0530, Ritesh Harjani (IBM) wrote: > +extern void ext2_write_failed(struct address_space *mapping, loff_t to); Nit: please don't bother with extents for function prototypes. > +static int ext2_dio_write_end_io(struct kiocb *iocb, ssize_t size, > + int error, unsigned int flags) > +{ > + loff_t pos = iocb->ki_pos; > + struct inode *inode = file_inode(iocb->ki_filp); > + > + if (error) > + return error; > + > + pos += size; > + if (pos > i_size_read(inode)) > + i_size_write(inode, pos); Doesn't i_size_write need i_mutex protection? > + /* > + * We pass IOMAP_DIO_NOSYNC because otherwise iomap_dio_rw() > + * calls for generic_write_sync in iomap_dio_complete(). > + * Since ext2_fsync nmust be called w/o inode lock, > + * hence we pass IOMAP_DIO_NOSYNC and handle generic_write_sync() > + * ourselves. > + */ > + flags = IOMAP_DIO_NOSYNC; So we added IOMAP_DIO_NOSYNC for btrfs initially, but even btrfs did not manage to mak it work. I suspect the right thing here as well is to call __iomap_dio_rw and then separately after dropping the lock. Without that you for example can't take i_mutex in the end_io handler, which I think we need to do. We should then remove IOMAP_DIO_NOSYNC again.
"Darrick J. Wong" <djwong@kernel.org> writes: > On Thu, Mar 16, 2023 at 08:10:29PM +0530, Ritesh Harjani (IBM) wrote: >> [DO NOT MERGE] [WORK-IN-PROGRESS] >> >> Hello Jan, >> >> This is an initial version of the patch set which I wanted to share >> before today's call. This is still work in progress but atleast passes >> the set of test cases which I had kept for dio testing (except 1 from my >> list). >> >> Looks like there won't be much/any changes required from iomap side to >> support ext2 moving to iomap apis. >> >> I will be doing some more testing specifically test generic/083 which is >> occassionally failing in my testing. >> Also once this is stabilized, I can do some performance testing too if you >> feel so. Last I remembered we saw some performance regressions when ext4 >> moved to iomap for dio. >> >> PS: Please ignore if there are some silly mistakes. As I said, I wanted >> to get this out before today's discussion. :) >> >> Thanks for your help!! >> >> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com> >> --- >> fs/ext2/ext2.h | 1 + >> fs/ext2/file.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++ >> fs/ext2/inode.c | 20 +-------- >> 3 files changed, 117 insertions(+), 18 deletions(-) >> >> diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h >> index cb78d7dcfb95..cb5e309fe040 100644 >> --- a/fs/ext2/ext2.h >> +++ b/fs/ext2/ext2.h >> @@ -753,6 +753,7 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned); >> extern struct inode *ext2_iget (struct super_block *, unsigned long); >> extern int ext2_write_inode (struct inode *, struct writeback_control *); >> extern void ext2_evict_inode(struct inode *); >> +extern void ext2_write_failed(struct address_space *mapping, loff_t to); >> extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); >> extern int ext2_setattr (struct mnt_idmap *, struct dentry *, struct iattr *); >> extern int ext2_getattr (struct mnt_idmap *, const struct path *, >> diff --git a/fs/ext2/file.c b/fs/ext2/file.c >> index 6b4bebe982ca..7a8561304559 100644 >> --- a/fs/ext2/file.c >> +++ b/fs/ext2/file.c >> @@ -161,12 +161,123 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync) >> return ret; >> } >> >> +static ssize_t ext2_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) >> +{ >> + struct file *file = iocb->ki_filp; >> + struct inode *inode = file->f_mapping->host; >> + ssize_t ret; >> + >> + inode_lock_shared(inode); >> + ret = iomap_dio_rw(iocb, to, &ext2_iomap_ops, NULL, 0, NULL, 0); >> + inode_unlock_shared(inode); >> + >> + return ret; >> +} >> + >> +static int ext2_dio_write_end_io(struct kiocb *iocb, ssize_t size, >> + int error, unsigned int flags) >> +{ >> + loff_t pos = iocb->ki_pos; >> + struct inode *inode = file_inode(iocb->ki_filp); >> + >> + if (error) >> + return error; >> + >> + pos += size; >> + if (pos > i_size_read(inode)) >> + i_size_write(inode, pos); >> + >> + return 0; >> +} >> + >> +static const struct iomap_dio_ops ext2_dio_write_ops = { >> + .end_io = ext2_dio_write_end_io, >> +}; >> + >> +static ssize_t ext2_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) >> +{ >> + struct file *file = iocb->ki_filp; >> + struct inode *inode = file->f_mapping->host; >> + ssize_t ret; >> + unsigned int flags; >> + unsigned long blocksize = inode->i_sb->s_blocksize; >> + loff_t offset = iocb->ki_pos; >> + loff_t count = iov_iter_count(from); >> + >> + >> + inode_lock(inode); >> + ret = generic_write_checks(iocb, from); >> + if (ret <= 0) >> + goto out_unlock; >> + ret = file_remove_privs(file); >> + if (ret) >> + goto out_unlock; >> + ret = file_update_time(file); >> + if (ret) >> + goto out_unlock; > > kiocb_modified() instead of calling file_remove_privs? Yes, looks likle it is a replacement for file_remove_privs and file_update_time(). >> + >> + /* >> + * We pass IOMAP_DIO_NOSYNC because otherwise iomap_dio_rw() >> + * calls for generic_write_sync in iomap_dio_complete(). >> + * Since ext2_fsync nmust be called w/o inode lock, >> + * hence we pass IOMAP_DIO_NOSYNC and handle generic_write_sync() >> + * ourselves. >> + */ >> + flags = IOMAP_DIO_NOSYNC; >> + >> + /* use IOMAP_DIO_FORCE_WAIT for unaligned of extending writes */ >> + if (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode) || >> + (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(from), blocksize))) >> + flags |= IOMAP_DIO_FORCE_WAIT; >> + >> + ret = iomap_dio_rw(iocb, from, &ext2_iomap_ops, &ext2_dio_write_ops, >> + flags, NULL, 0); >> + >> + if (ret == -ENOTBLK) >> + ret = 0; >> + >> + if (ret < 0 && ret != -EIOCBQUEUED) >> + ext2_write_failed(inode->i_mapping, offset + count); >> + >> + /* handle case for partial write or fallback to buffered write */ >> + if (ret >= 0 && iov_iter_count(from)) { >> + loff_t pos, endbyte; >> + ssize_t status; >> + ssize_t ret2; >> + >> + pos = iocb->ki_pos; >> + status = generic_perform_write(iocb, from); >> + if (unlikely(status < 0)) { >> + ret = status; >> + goto out_unlock; >> + } >> + endbyte = pos + status - 1; >> + ret2 = filemap_write_and_wait_range(inode->i_mapping, pos, >> + endbyte); >> + if (ret2 == 0) { >> + iocb->ki_pos = endbyte + 1; >> + ret += status; >> + invalidate_mapping_pages(inode->i_mapping, >> + pos >> PAGE_SHIFT, >> + endbyte >> PAGE_SHIFT); >> + } >> + } > > (Why not fall back to the actual buffered write path?) > Because then we can handle everything related to DIO in ext4_dio_file_write() itself e.g. As per the semantics of DIO we should ensure that page-cache pages are written to disk and invalidated before returning (filemap_write_and_wait_range() and invalidate_mapping_pages()) > Otherwise this looks like a reasonable first start. Thanks! -ritesh
On Thu 16-03-23 20:10:29, Ritesh Harjani (IBM) wrote: > [DO NOT MERGE] [WORK-IN-PROGRESS] > > Hello Jan, > > This is an initial version of the patch set which I wanted to share > before today's call. This is still work in progress but atleast passes > the set of test cases which I had kept for dio testing (except 1 from my > list). > > Looks like there won't be much/any changes required from iomap side to > support ext2 moving to iomap apis. > > I will be doing some more testing specifically test generic/083 which is > occassionally failing in my testing. > Also once this is stabilized, I can do some performance testing too if you > feel so. Last I remembered we saw some performance regressions when ext4 > moved to iomap for dio. > > PS: Please ignore if there are some silly mistakes. As I said, I wanted > to get this out before today's discussion. :) > > Thanks for your help!! > > Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com> > --- > fs/ext2/ext2.h | 1 + > fs/ext2/file.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++ > fs/ext2/inode.c | 20 +-------- > 3 files changed, 117 insertions(+), 18 deletions(-) > > diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h > index cb78d7dcfb95..cb5e309fe040 100644 > --- a/fs/ext2/ext2.h > +++ b/fs/ext2/ext2.h > @@ -753,6 +753,7 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned); > extern struct inode *ext2_iget (struct super_block *, unsigned long); > extern int ext2_write_inode (struct inode *, struct writeback_control *); > extern void ext2_evict_inode(struct inode *); > +extern void ext2_write_failed(struct address_space *mapping, loff_t to); > extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); > extern int ext2_setattr (struct mnt_idmap *, struct dentry *, struct iattr *); > extern int ext2_getattr (struct mnt_idmap *, const struct path *, > diff --git a/fs/ext2/file.c b/fs/ext2/file.c > index 6b4bebe982ca..7a8561304559 100644 > --- a/fs/ext2/file.c > +++ b/fs/ext2/file.c > @@ -161,12 +161,123 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync) > return ret; > } > > +static ssize_t ext2_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) > +{ > + struct file *file = iocb->ki_filp; > + struct inode *inode = file->f_mapping->host; > + ssize_t ret; > + > + inode_lock_shared(inode); > + ret = iomap_dio_rw(iocb, to, &ext2_iomap_ops, NULL, 0, NULL, 0); > + inode_unlock_shared(inode); > + > + return ret; > +} > + > +static int ext2_dio_write_end_io(struct kiocb *iocb, ssize_t size, > + int error, unsigned int flags) > +{ > + loff_t pos = iocb->ki_pos; > + struct inode *inode = file_inode(iocb->ki_filp); > + > + if (error) > + return error; > + I guess you should carry over here relevant bits of the comment from ext4_dio_write_end_io() explaining that doing i_size update here is necessary and actually safe. > + pos += size; > + if (pos > i_size_read(inode)) > + i_size_write(inode, pos); > + > + return 0; > +} > + > +static const struct iomap_dio_ops ext2_dio_write_ops = { > + .end_io = ext2_dio_write_end_io, > +}; > + > +static ssize_t ext2_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) > +{ > + struct file *file = iocb->ki_filp; > + struct inode *inode = file->f_mapping->host; > + ssize_t ret; > + unsigned int flags; > + unsigned long blocksize = inode->i_sb->s_blocksize; > + loff_t offset = iocb->ki_pos; > + loff_t count = iov_iter_count(from); > + > + > + inode_lock(inode); > + ret = generic_write_checks(iocb, from); > + if (ret <= 0) > + goto out_unlock; > + ret = file_remove_privs(file); > + if (ret) > + goto out_unlock; > + ret = file_update_time(file); > + if (ret) > + goto out_unlock; > + > + /* > + * We pass IOMAP_DIO_NOSYNC because otherwise iomap_dio_rw() > + * calls for generic_write_sync in iomap_dio_complete(). > + * Since ext2_fsync nmust be called w/o inode lock, > + * hence we pass IOMAP_DIO_NOSYNC and handle generic_write_sync() > + * ourselves. > + */ > + flags = IOMAP_DIO_NOSYNC; Meh, this is kind of ugly and we should come up with something better for simple filesystems so that they don't have to play these games. Frankly, these days I doubt there's anybody really needing inode_lock in __generic_file_fsync(). Neither sync_mapping_buffers() nor sync_inode_metadata() need inode_lock for their self-consistency. So it is only about flushing more consistent set of metadata to disk when fsync(2) races with other write(2)s to the same file so after a crash we have higher chances of seeing some real state of the file. But I'm not sure it's really worth keeping for filesystems that are still using sync_mapping_buffers(). People that care about consistency after a crash have IMHO moved to other filesystems long ago. > + > + /* use IOMAP_DIO_FORCE_WAIT for unaligned of extending writes */ ^^ or > + if (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode) || > + (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(from), blocksize))) > + flags |= IOMAP_DIO_FORCE_WAIT; > + > + ret = iomap_dio_rw(iocb, from, &ext2_iomap_ops, &ext2_dio_write_ops, > + flags, NULL, 0); > + > + if (ret == -ENOTBLK) > + ret = 0; So iomap_dio_rw() doesn't have the DIO_SKIP_HOLES behavior of blockdev_direct_IO(). Thus you have to implement that in your ext2_iomap_ops, in particular in iomap_begin... Honza
Thanks Jan for your feedback! Jan Kara <jack@suse.cz> writes: > On Thu 16-03-23 20:10:29, Ritesh Harjani (IBM) wrote: >> [DO NOT MERGE] [WORK-IN-PROGRESS] >> >> Hello Jan, >> >> This is an initial version of the patch set which I wanted to share >> before today's call. This is still work in progress but atleast passes >> the set of test cases which I had kept for dio testing (except 1 from my >> list). >> >> Looks like there won't be much/any changes required from iomap side to >> support ext2 moving to iomap apis. >> >> I will be doing some more testing specifically test generic/083 which is >> occassionally failing in my testing. >> Also once this is stabilized, I can do some performance testing too if you >> feel so. Last I remembered we saw some performance regressions when ext4 >> moved to iomap for dio. >> >> PS: Please ignore if there are some silly mistakes. As I said, I wanted >> to get this out before today's discussion. :) >> >> Thanks for your help!! >> >> Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com> >> --- >> fs/ext2/ext2.h | 1 + >> fs/ext2/file.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++ >> fs/ext2/inode.c | 20 +-------- >> 3 files changed, 117 insertions(+), 18 deletions(-) >> >> diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h >> index cb78d7dcfb95..cb5e309fe040 100644 >> --- a/fs/ext2/ext2.h >> +++ b/fs/ext2/ext2.h >> @@ -753,6 +753,7 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned); >> extern struct inode *ext2_iget (struct super_block *, unsigned long); >> extern int ext2_write_inode (struct inode *, struct writeback_control *); >> extern void ext2_evict_inode(struct inode *); >> +extern void ext2_write_failed(struct address_space *mapping, loff_t to); >> extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); >> extern int ext2_setattr (struct mnt_idmap *, struct dentry *, struct iattr *); >> extern int ext2_getattr (struct mnt_idmap *, const struct path *, >> diff --git a/fs/ext2/file.c b/fs/ext2/file.c >> index 6b4bebe982ca..7a8561304559 100644 >> --- a/fs/ext2/file.c >> +++ b/fs/ext2/file.c >> @@ -161,12 +161,123 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync) >> return ret; >> } >> >> +static ssize_t ext2_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) >> +{ >> + struct file *file = iocb->ki_filp; >> + struct inode *inode = file->f_mapping->host; >> + ssize_t ret; >> + >> + inode_lock_shared(inode); >> + ret = iomap_dio_rw(iocb, to, &ext2_iomap_ops, NULL, 0, NULL, 0); >> + inode_unlock_shared(inode); >> + >> + return ret; >> +} >> + >> +static int ext2_dio_write_end_io(struct kiocb *iocb, ssize_t size, >> + int error, unsigned int flags) >> +{ >> + loff_t pos = iocb->ki_pos; >> + struct inode *inode = file_inode(iocb->ki_filp); >> + >> + if (error) >> + return error; >> + > > I guess you should carry over here relevant bits of the comment from > ext4_dio_write_end_io() explaining that doing i_size update here is > necessary and actually safe. > Yes, sure. Will do that in the next rev. >> + pos += size; >> + if (pos > i_size_read(inode)) >> + i_size_write(inode, pos); >> + >> + return 0; >> +} >> + >> +static const struct iomap_dio_ops ext2_dio_write_ops = { >> + .end_io = ext2_dio_write_end_io, >> +}; >> + >> +static ssize_t ext2_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) >> +{ >> + struct file *file = iocb->ki_filp; >> + struct inode *inode = file->f_mapping->host; >> + ssize_t ret; >> + unsigned int flags; >> + unsigned long blocksize = inode->i_sb->s_blocksize; >> + loff_t offset = iocb->ki_pos; >> + loff_t count = iov_iter_count(from); >> + >> + >> + inode_lock(inode); >> + ret = generic_write_checks(iocb, from); >> + if (ret <= 0) >> + goto out_unlock; >> + ret = file_remove_privs(file); >> + if (ret) >> + goto out_unlock; >> + ret = file_update_time(file); >> + if (ret) >> + goto out_unlock; >> + >> + /* >> + * We pass IOMAP_DIO_NOSYNC because otherwise iomap_dio_rw() >> + * calls for generic_write_sync in iomap_dio_complete(). >> + * Since ext2_fsync nmust be called w/o inode lock, >> + * hence we pass IOMAP_DIO_NOSYNC and handle generic_write_sync() >> + * ourselves. >> + */ >> + flags = IOMAP_DIO_NOSYNC; > > Meh, this is kind of ugly and we should come up with something better for > simple filesystems so that they don't have to play these games. Frankly, > these days I doubt there's anybody really needing inode_lock in > __generic_file_fsync(). Neither sync_mapping_buffers() nor > sync_inode_metadata() need inode_lock for their self-consistency. So it is > only about flushing more consistent set of metadata to disk when fsync(2) > races with other write(2)s to the same file so after a crash we have higher > chances of seeing some real state of the file. But I'm not sure it's really > worth keeping for filesystems that are still using sync_mapping_buffers(). > People that care about consistency after a crash have IMHO moved to other > filesystems long ago. > One way which hch is suggesting is to use __iomap_dio_rw() -> unlock inode -> call generic_write_sync(). I haven't yet worked on this part. Are you suggesting to rip of inode_lock from __generic_file_fsync()? Won't it have a much larger implications? >> + >> + /* use IOMAP_DIO_FORCE_WAIT for unaligned of extending writes */ > ^^ or > Yup. will fix it. >> + if (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode) || >> + (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(from), blocksize))) >> + flags |= IOMAP_DIO_FORCE_WAIT; >> + >> + ret = iomap_dio_rw(iocb, from, &ext2_iomap_ops, &ext2_dio_write_ops, >> + flags, NULL, 0); >> + >> + if (ret == -ENOTBLK) >> + ret = 0; > > So iomap_dio_rw() doesn't have the DIO_SKIP_HOLES behavior of > blockdev_direct_IO(). Thus you have to implement that in your > ext2_iomap_ops, in particular in iomap_begin... > Aah yes. Thanks for pointing that out - ext2_iomap_begin() should have something like this - /* * We cannot fill holes in indirect tree based inodes as that could * expose stale data in the case of a crash. Use the magic error code * to fallback to buffered I/O. */ Also I think ext2_iomap_end() should also handle a case like in ext4 - /* * Check to see whether an error occurred while writing out the data to * the allocated blocks. If so, return the magic error code so that we * fallback to buffered I/O and attempt to complete the remainder of * the I/O. Any blocks that may have been allocated in preparation for * the direct I/O will be reused during buffered I/O. */ if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0) return -ENOTBLK; I am wondering if we have testcases in xfstests which really tests these functionalities also or not? Let me give it a try... ... So I did and somehow couldn't find any testcase which fails w/o above changes. I also ran LTP dio tests (./runltp -f dio -d /mnt1/scratch/tmp/) with the current patches on ext2 and all the tests passed. So it seems LTP doesn't have tests to trigger stale data exposure problems. Do you know of any test case which could trigger this? Otherwise I can finish updating the patches and then work on writing some test cases to trigger this. Another query - We have this function ext2_iomap_end() (pasted below) which calls ext2_write_failed(). Here IMO two cases are possible - 1. written is 0. which means an error has occurred. In that case calling ext2_write_failed() make sense. 2. consider a case where written > 0 && written < length. (This is possible right?). In that case we still go and call ext2_write_failed(). This function will truncate the pagecache and disk blocks beyong i_size. Now we haven't yet updated inode->i_size (we do that in ->end_io which gets called in the end during completion) So that means it just removes everything. Then in ext2_dax_write_iter(), we might go and update inode->i_size to iocb->ki_pos including for short writes. This looks like it isn't consistent because earlier we had destroyed all the blocks for the short writes and we will be returning ret > 0 to the user saying these many bytes have been written. Again I haven't yet found a test case at least not in xfstests which can trigger this short writes. Let me know your thoughts on this. All of this lies on the fact that there can be a case where written > 0 && written < length. I will read more to see if this even happens or not. But I atleast wanted to capture this somewhere. static int ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { if (iomap->type == IOMAP_MAPPED && written < length && (flags & IOMAP_WRITE)) ext2_write_failed(inode->i_mapping, offset + length); return 0; } void ext2_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; if (to > inode->i_size) { truncate_pagecache(inode, inode->i_size); ext2_truncate_blocks(inode, inode->i_size); } } ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) <...> ret = dax_iomap_rw(iocb, from, &ext2_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); mark_inode_dirty(inode); } <...> Another thing - In dax while truncating the inode i_size in ext2_setsize(), I think we don't properly call dax_zero_blocks() when we are trying to zero the last block beyond EOF. i.e. for e.g. it can be called with len as 0 if newsize is page_aligned. It then will call ext2_get_blocks() with len = 0 and can bug_on at maxblocks == 0. I think it should be this. I will spend some more time analyzing this and also test it once against DAX paths. diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 7ff669d0b6d2..cc264b1e288c 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1243,9 +1243,8 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) inode_dio_wait(inode); if (IS_DAX(inode)) - error = dax_zero_range(inode, newsize, - PAGE_ALIGN(newsize) - newsize, NULL, - &ext2_iomap_ops); + error = dax_truncate_page(inode, newsize, NULL, + &ext2_iomap_ops); else error = block_truncate_page(inode->i_mapping, newsize, ext2_get_block); -ritesh > Honza > -- > Jan Kara <jack@suse.com> > SUSE Labs, CR
On Wed 22-03-23 12:04:01, Ritesh Harjani wrote: > Jan Kara <jack@suse.cz> writes: > >> + pos += size; > >> + if (pos > i_size_read(inode)) > >> + i_size_write(inode, pos); > >> + > >> + return 0; > >> +} > >> + > >> +static const struct iomap_dio_ops ext2_dio_write_ops = { > >> + .end_io = ext2_dio_write_end_io, > >> +}; > >> + > >> +static ssize_t ext2_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) > >> +{ > >> + struct file *file = iocb->ki_filp; > >> + struct inode *inode = file->f_mapping->host; > >> + ssize_t ret; > >> + unsigned int flags; > >> + unsigned long blocksize = inode->i_sb->s_blocksize; > >> + loff_t offset = iocb->ki_pos; > >> + loff_t count = iov_iter_count(from); > >> + > >> + > >> + inode_lock(inode); > >> + ret = generic_write_checks(iocb, from); > >> + if (ret <= 0) > >> + goto out_unlock; > >> + ret = file_remove_privs(file); > >> + if (ret) > >> + goto out_unlock; > >> + ret = file_update_time(file); > >> + if (ret) > >> + goto out_unlock; > >> + > >> + /* > >> + * We pass IOMAP_DIO_NOSYNC because otherwise iomap_dio_rw() > >> + * calls for generic_write_sync in iomap_dio_complete(). > >> + * Since ext2_fsync nmust be called w/o inode lock, > >> + * hence we pass IOMAP_DIO_NOSYNC and handle generic_write_sync() > >> + * ourselves. > >> + */ > >> + flags = IOMAP_DIO_NOSYNC; > > > > Meh, this is kind of ugly and we should come up with something better for > > simple filesystems so that they don't have to play these games. Frankly, > > these days I doubt there's anybody really needing inode_lock in > > __generic_file_fsync(). Neither sync_mapping_buffers() nor > > sync_inode_metadata() need inode_lock for their self-consistency. So it is > > only about flushing more consistent set of metadata to disk when fsync(2) > > races with other write(2)s to the same file so after a crash we have higher > > chances of seeing some real state of the file. But I'm not sure it's really > > worth keeping for filesystems that are still using sync_mapping_buffers(). > > People that care about consistency after a crash have IMHO moved to other > > filesystems long ago. > > > > One way which hch is suggesting is to use __iomap_dio_rw() -> unlock > inode -> call generic_write_sync(). I haven't yet worked on this part. So I see two problems with what Christoph suggests: a) It is unfortunate API design to require trivial (and low maintenance) filesystem to do these relatively complex locking games. But this can be solved by providing appropriate wrapper for them I guess. b) When you unlock the inode, other stuff can happen with the inode. And e.g. i_size update needs to happen after IO is completed so filesystems would have to be taught to avoid say two racing expanding writes. That's IMHO really too much to ask. > Are you suggesting to rip of inode_lock from __generic_file_fsync()? > Won't it have a much larger implications? Yes and yes :). But inode writeback already happens from other paths without inode_lock so there's hardly any surprise there. sync_mapping_buffers() is impossible to "customize" by filesystems and the generic code is fine without inode_lock. So I have hard time imagining how any filesystem would really depend on inode_lock in this path (famous last words ;)). > >> + if (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode) || > >> + (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(from), blocksize))) > >> + flags |= IOMAP_DIO_FORCE_WAIT; > >> + > >> + ret = iomap_dio_rw(iocb, from, &ext2_iomap_ops, &ext2_dio_write_ops, > >> + flags, NULL, 0); > >> + > >> + if (ret == -ENOTBLK) > >> + ret = 0; > > > > So iomap_dio_rw() doesn't have the DIO_SKIP_HOLES behavior of > > blockdev_direct_IO(). Thus you have to implement that in your > > ext2_iomap_ops, in particular in iomap_begin... > > > > Aah yes. Thanks for pointing that out - > ext2_iomap_begin() should have something like this - > /* > * We cannot fill holes in indirect tree based inodes as that could > * expose stale data in the case of a crash. Use the magic error code > * to fallback to buffered I/O. > */ > > Also I think ext2_iomap_end() should also handle a case like in ext4 - > > /* > * Check to see whether an error occurred while writing out the data to > * the allocated blocks. If so, return the magic error code so that we > * fallback to buffered I/O and attempt to complete the remainder of > * the I/O. Any blocks that may have been allocated in preparation for > * the direct I/O will be reused during buffered I/O. > */ > if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0) > return -ENOTBLK; > > > I am wondering if we have testcases in xfstests which really tests these > functionalities also or not? Let me give it a try... > ... So I did and somehow couldn't find any testcase which fails w/o > above changes. I guess we don't. It isn't that simple (but certainly possible) to test for stale data exposure... > Another query - > > We have this function ext2_iomap_end() (pasted below) > which calls ext2_write_failed(). > > Here IMO two cases are possible - > > 1. written is 0. which means an error has occurred. > In that case calling ext2_write_failed() make sense. > > 2. consider a case where written > 0 && written < length. > (This is possible right?). In that case we still go and call > ext2_write_failed(). This function will truncate the pagecache and disk > blocks beyong i_size. Now we haven't yet updated inode->i_size (we do > that in ->end_io which gets called in the end during completion) > So that means it just removes everything. > > Then in ext2_dax_write_iter(), we might go and update inode->i_size > to iocb->ki_pos including for short writes. This looks like it isn't > consistent because earlier we had destroyed all the blocks for the short > writes and we will be returning ret > 0 to the user saying these many > bytes have been written. > Again I haven't yet found a test case at least not in xfstests which > can trigger this short writes. Let me know your thoughts on this. > All of this lies on the fact that there can be a case where > written > 0 && written < length. I will read more to see if this even > happens or not. But I atleast wanted to capture this somewhere. So as far as I remember, direct IO writes as implemented in iomap are all-or-nothing (see iomap_dio_complete()). But it would be good to assert that in ext4 code to avoid surprises if the generic code changes. > Another thing - > In dax while truncating the inode i_size in ext2_setsize(), > I think we don't properly call dax_zero_blocks() when we are trying to > zero the last block beyond EOF. i.e. for e.g. it can be called with len > as 0 if newsize is page_aligned. It then will call ext2_get_blocks() with > len = 0 and can bug_on at maxblocks == 0. How will it call ext2_get_blocks() with len == 0? AFAICS iomap_iter() will not call iomap_begin() at all if iter.len == 0. > I think it should be this. I will spend some more time analyzing this > and also test it once against DAX paths. > > diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c > index 7ff669d0b6d2..cc264b1e288c 100644 > --- a/fs/ext2/inode.c > +++ b/fs/ext2/inode.c > @@ -1243,9 +1243,8 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) > inode_dio_wait(inode); > > if (IS_DAX(inode)) > - error = dax_zero_range(inode, newsize, > - PAGE_ALIGN(newsize) - newsize, NULL, > - &ext2_iomap_ops); > + error = dax_truncate_page(inode, newsize, NULL, > + &ext2_iomap_ops); > else > error = block_truncate_page(inode->i_mapping, > newsize, ext2_get_block); That being said this is indeed a nice cleanup. Honza
Jan Kara <jack@suse.cz> writes: Hi Jan, > On Wed 22-03-23 12:04:01, Ritesh Harjani wrote: >> Jan Kara <jack@suse.cz> writes: >> >> + pos += size; >> >> + if (pos > i_size_read(inode)) >> >> + i_size_write(inode, pos); >> >> + >> >> + return 0; >> >> +} >> >> + >> >> +static const struct iomap_dio_ops ext2_dio_write_ops = { >> >> + .end_io = ext2_dio_write_end_io, >> >> +}; >> >> + >> >> +static ssize_t ext2_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) >> >> +{ >> >> + struct file *file = iocb->ki_filp; >> >> + struct inode *inode = file->f_mapping->host; >> >> + ssize_t ret; >> >> + unsigned int flags; >> >> + unsigned long blocksize = inode->i_sb->s_blocksize; >> >> + loff_t offset = iocb->ki_pos; >> >> + loff_t count = iov_iter_count(from); >> >> + >> >> + >> >> + inode_lock(inode); >> >> + ret = generic_write_checks(iocb, from); >> >> + if (ret <= 0) >> >> + goto out_unlock; >> >> + ret = file_remove_privs(file); >> >> + if (ret) >> >> + goto out_unlock; >> >> + ret = file_update_time(file); >> >> + if (ret) >> >> + goto out_unlock; >> >> + >> >> + /* >> >> + * We pass IOMAP_DIO_NOSYNC because otherwise iomap_dio_rw() >> >> + * calls for generic_write_sync in iomap_dio_complete(). >> >> + * Since ext2_fsync nmust be called w/o inode lock, >> >> + * hence we pass IOMAP_DIO_NOSYNC and handle generic_write_sync() >> >> + * ourselves. >> >> + */ >> >> + flags = IOMAP_DIO_NOSYNC; >> > >> > Meh, this is kind of ugly and we should come up with something better for >> > simple filesystems so that they don't have to play these games. Frankly, >> > these days I doubt there's anybody really needing inode_lock in >> > __generic_file_fsync(). Neither sync_mapping_buffers() nor >> > sync_inode_metadata() need inode_lock for their self-consistency. So it is >> > only about flushing more consistent set of metadata to disk when fsync(2) >> > races with other write(2)s to the same file so after a crash we have higher >> > chances of seeing some real state of the file. But I'm not sure it's really >> > worth keeping for filesystems that are still using sync_mapping_buffers(). >> > People that care about consistency after a crash have IMHO moved to other >> > filesystems long ago. >> > >> >> One way which hch is suggesting is to use __iomap_dio_rw() -> unlock >> inode -> call generic_write_sync(). I haven't yet worked on this part. > > So I see two problems with what Christoph suggests: > > a) It is unfortunate API design to require trivial (and low maintenance) > filesystem to do these relatively complex locking games. But this can > be solved by providing appropriate wrapper for them I guess. > > b) When you unlock the inode, other stuff can happen with the inode. And > e.g. i_size update needs to happen after IO is completed so filesystems > would have to be taught to avoid say two racing expanding writes. That's > IMHO really too much to ask. > yes, that's the reason I was not touching it and I thought I will get back to it once I figure out other things. >> Are you suggesting to rip of inode_lock from __generic_file_fsync()? >> Won't it have a much larger implications? > > Yes and yes :). But inode writeback already happens from other paths > without inode_lock so there's hardly any surprise there. > sync_mapping_buffers() is impossible to "customize" by filesystems and the > generic code is fine without inode_lock. So I have hard time imagining how > any filesystem would really depend on inode_lock in this path (famous last > words ;)). > Ok sure. I will spend sometime looking into this code and history. And if everything looks good, will rip off inode_lock() from __generic_file_fsync(). >> >> + if (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode) || >> >> + (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(from), blocksize))) >> >> + flags |= IOMAP_DIO_FORCE_WAIT; >> >> + >> >> + ret = iomap_dio_rw(iocb, from, &ext2_iomap_ops, &ext2_dio_write_ops, >> >> + flags, NULL, 0); >> >> + >> >> + if (ret == -ENOTBLK) >> >> + ret = 0; >> > >> > So iomap_dio_rw() doesn't have the DIO_SKIP_HOLES behavior of >> > blockdev_direct_IO(). Thus you have to implement that in your >> > ext2_iomap_ops, in particular in iomap_begin... >> > >> >> Aah yes. Thanks for pointing that out - >> ext2_iomap_begin() should have something like this - >> /* >> * We cannot fill holes in indirect tree based inodes as that could >> * expose stale data in the case of a crash. Use the magic error code >> * to fallback to buffered I/O. >> */ >> >> Also I think ext2_iomap_end() should also handle a case like in ext4 - >> >> /* >> * Check to see whether an error occurred while writing out the data to >> * the allocated blocks. If so, return the magic error code so that we >> * fallback to buffered I/O and attempt to complete the remainder of >> * the I/O. Any blocks that may have been allocated in preparation for >> * the direct I/O will be reused during buffered I/O. >> */ >> if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0) >> return -ENOTBLK; >> >> >> I am wondering if we have testcases in xfstests which really tests these >> functionalities also or not? Let me give it a try... >> ... So I did and somehow couldn't find any testcase which fails w/o >> above changes. > > I guess we don't. It isn't that simple (but certainly possible) to test for > stale data exposure... > Yes, I am currently working on writing a aio dio test case to simulate this. AFAIU, the stale data exposure problems with non-extent filesystem is because it doesn't support unwritten extents. So in such case, if we submit an aio-dio we on a hole (inside file i_size), then a racing buffered read can race with it. That is, if the block allocation happens via aio-dio then the buffered read can read stale data from that block which is within inode i_size. This is not a problem for extending file writes because we anyway update file i_size after the write is done (and aio-dio is also synchronous against extending file writes but that's is to avoid race against other aio-dio). And this is not a problem for fileystems which supports unwritten extent because we can always allocate an unwritten extent inside a hole and then a racing buffered read cannot read stale data (because unwritten extents returns 0). Then unwritten to written conversion can happen at the end once the data is already written to these blocks. >> Another query - >> >> We have this function ext2_iomap_end() (pasted below) >> which calls ext2_write_failed(). >> >> Here IMO two cases are possible - >> >> 1. written is 0. which means an error has occurred. >> In that case calling ext2_write_failed() make sense. >> >> 2. consider a case where written > 0 && written < length. >> (This is possible right?). In that case we still go and call >> ext2_write_failed(). This function will truncate the pagecache and disk >> blocks beyong i_size. Now we haven't yet updated inode->i_size (we do >> that in ->end_io which gets called in the end during completion) >> So that means it just removes everything. >> >> Then in ext2_dax_write_iter(), we might go and update inode->i_size >> to iocb->ki_pos including for short writes. This looks like it isn't >> consistent because earlier we had destroyed all the blocks for the short >> writes and we will be returning ret > 0 to the user saying these many >> bytes have been written. >> Again I haven't yet found a test case at least not in xfstests which >> can trigger this short writes. Let me know your thoughts on this. >> All of this lies on the fact that there can be a case where >> written > 0 && written < length. I will read more to see if this even >> happens or not. But I atleast wanted to capture this somewhere. > > So as far as I remember, direct IO writes as implemented in iomap are > all-or-nothing (see iomap_dio_complete()). But it would be good to assert > that in ext4 code to avoid surprises if the generic code changes. > Ok, I was talking about error paths. But I think I will park this one to go through later. >> Another thing - >> In dax while truncating the inode i_size in ext2_setsize(), >> I think we don't properly call dax_zero_blocks() when we are trying to >> zero the last block beyond EOF. i.e. for e.g. it can be called with len >> as 0 if newsize is page_aligned. It then will call ext2_get_blocks() with >> len = 0 and can bug_on at maxblocks == 0. > > How will it call ext2_get_blocks() with len == 0? AFAICS iomap_iter() will > not call iomap_begin() at all if iter.len == 0. > It can. In dax_zero_range() if we pass len = 0, then iomap_iter() can get can get called with 0 len and it will call ->iomap_begin() with 0 len. Maybe a WARN_ON() would certainly help in iomap_iter() to check against 0 iter->len to be passed to ->iomap_begin() (Note iter->len and iter->iomap.length are different) >> I think it should be this. I will spend some more time analyzing this >> and also test it once against DAX paths. >> >> diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c >> index 7ff669d0b6d2..cc264b1e288c 100644 >> --- a/fs/ext2/inode.c >> +++ b/fs/ext2/inode.c >> @@ -1243,9 +1243,8 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) >> inode_dio_wait(inode); >> >> if (IS_DAX(inode)) >> - error = dax_zero_range(inode, newsize, >> - PAGE_ALIGN(newsize) - newsize, NULL, >> - &ext2_iomap_ops); >> + error = dax_truncate_page(inode, newsize, NULL, >> + &ext2_iomap_ops); >> else >> error = block_truncate_page(inode->i_mapping, >> newsize, ext2_get_block); > > That being said this is indeed a nice cleanup. Sure. I will add this patch in the beginning of the next revision. -ritesh
On Thu, Mar 23, 2023 at 12:30:30PM +0100, Jan Kara wrote: > > > > One way which hch is suggesting is to use __iomap_dio_rw() -> unlock > > inode -> call generic_write_sync(). I haven't yet worked on this part. > > So I see two problems with what Christoph suggests: > > a) It is unfortunate API design to require trivial (and low maintenance) > filesystem to do these relatively complex locking games. But this can > be solved by providing appropriate wrapper for them I guess. Well, the problem is that this "trivial" file systems have a pretty broken locking scheme for fsync. The legacy dio code gets around this by unlocking i_rwsem inside of __blockdev_direct_IO. Which force a particular and somewhat odd locking scheme on the callers, and severly limits it what it can do. > > Are you suggesting to rip of inode_lock from __generic_file_fsync()? > > Won't it have a much larger implications? > > Yes and yes :). But inode writeback already happens from other paths > without inode_lock so there's hardly any surprise there. > sync_mapping_buffers() is impossible to "customize" by filesystems and the > generic code is fine without inode_lock. So I have hard time imagining how > any filesystem would really depend on inode_lock in this path (famous last > words ;)). Not holding the inode lock in ->fsync would solve all of this indeed.
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index cb78d7dcfb95..cb5e309fe040 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -753,6 +753,7 @@ extern unsigned long ext2_count_free (struct buffer_head *, unsigned); extern struct inode *ext2_iget (struct super_block *, unsigned long); extern int ext2_write_inode (struct inode *, struct writeback_control *); extern void ext2_evict_inode(struct inode *); +extern void ext2_write_failed(struct address_space *mapping, loff_t to); extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); extern int ext2_setattr (struct mnt_idmap *, struct dentry *, struct iattr *); extern int ext2_getattr (struct mnt_idmap *, const struct path *, diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 6b4bebe982ca..7a8561304559 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -161,12 +161,123 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync) return ret; } +static ssize_t ext2_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + + inode_lock_shared(inode); + ret = iomap_dio_rw(iocb, to, &ext2_iomap_ops, NULL, 0, NULL, 0); + inode_unlock_shared(inode); + + return ret; +} + +static int ext2_dio_write_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned int flags) +{ + loff_t pos = iocb->ki_pos; + struct inode *inode = file_inode(iocb->ki_filp); + + if (error) + return error; + + pos += size; + if (pos > i_size_read(inode)) + i_size_write(inode, pos); + + return 0; +} + +static const struct iomap_dio_ops ext2_dio_write_ops = { + .end_io = ext2_dio_write_end_io, +}; + +static ssize_t ext2_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file->f_mapping->host; + ssize_t ret; + unsigned int flags; + unsigned long blocksize = inode->i_sb->s_blocksize; + loff_t offset = iocb->ki_pos; + loff_t count = iov_iter_count(from); + + + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out_unlock; + ret = file_remove_privs(file); + if (ret) + goto out_unlock; + ret = file_update_time(file); + if (ret) + goto out_unlock; + + /* + * We pass IOMAP_DIO_NOSYNC because otherwise iomap_dio_rw() + * calls for generic_write_sync in iomap_dio_complete(). + * Since ext2_fsync nmust be called w/o inode lock, + * hence we pass IOMAP_DIO_NOSYNC and handle generic_write_sync() + * ourselves. + */ + flags = IOMAP_DIO_NOSYNC; + + /* use IOMAP_DIO_FORCE_WAIT for unaligned of extending writes */ + if (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode) || + (!IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(from), blocksize))) + flags |= IOMAP_DIO_FORCE_WAIT; + + ret = iomap_dio_rw(iocb, from, &ext2_iomap_ops, &ext2_dio_write_ops, + flags, NULL, 0); + + if (ret == -ENOTBLK) + ret = 0; + + if (ret < 0 && ret != -EIOCBQUEUED) + ext2_write_failed(inode->i_mapping, offset + count); + + /* handle case for partial write or fallback to buffered write */ + if (ret >= 0 && iov_iter_count(from)) { + loff_t pos, endbyte; + ssize_t status; + ssize_t ret2; + + pos = iocb->ki_pos; + status = generic_perform_write(iocb, from); + if (unlikely(status < 0)) { + ret = status; + goto out_unlock; + } + endbyte = pos + status - 1; + ret2 = filemap_write_and_wait_range(inode->i_mapping, pos, + endbyte); + if (ret2 == 0) { + iocb->ki_pos = endbyte + 1; + ret += status; + invalidate_mapping_pages(inode->i_mapping, + pos >> PAGE_SHIFT, + endbyte >> PAGE_SHIFT); + } + } +out_unlock: + inode_unlock(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} + static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { #ifdef CONFIG_FS_DAX if (IS_DAX(iocb->ki_filp->f_mapping->host)) return ext2_dax_read_iter(iocb, to); #endif + if (iocb->ki_flags & IOCB_DIRECT) + return ext2_dio_read_iter(iocb, to); + return generic_file_read_iter(iocb, to); } @@ -176,6 +287,9 @@ static ssize_t ext2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (IS_DAX(iocb->ki_filp->f_mapping->host)) return ext2_dax_write_iter(iocb, from); #endif + if (iocb->ki_flags & IOCB_DIRECT) + return ext2_dio_write_iter(iocb, from); + return generic_file_write_iter(iocb, from); } diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 26f135e7ffce..7ff669d0b6d2 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -56,7 +56,7 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode) static void ext2_truncate_blocks(struct inode *inode, loff_t offset); -static void ext2_write_failed(struct address_space *mapping, loff_t to) +void ext2_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; @@ -908,22 +908,6 @@ static sector_t ext2_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping,block,ext2_get_block); } -static ssize_t -ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - size_t count = iov_iter_count(iter); - loff_t offset = iocb->ki_pos; - ssize_t ret; - - ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block); - if (ret < 0 && iov_iter_rw(iter) == WRITE) - ext2_write_failed(mapping, offset + count); - return ret; -} - static int ext2_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -946,7 +930,7 @@ const struct address_space_operations ext2_aops = { .write_begin = ext2_write_begin, .write_end = ext2_write_end, .bmap = ext2_bmap, - .direct_IO = ext2_direct_IO, + .direct_IO = noop_direct_IO, .writepages = ext2_writepages, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate,
[DO NOT MERGE] [WORK-IN-PROGRESS] Hello Jan, This is an initial version of the patch set which I wanted to share before today's call. This is still work in progress but atleast passes the set of test cases which I had kept for dio testing (except 1 from my list). Looks like there won't be much/any changes required from iomap side to support ext2 moving to iomap apis. I will be doing some more testing specifically test generic/083 which is occassionally failing in my testing. Also once this is stabilized, I can do some performance testing too if you feel so. Last I remembered we saw some performance regressions when ext4 moved to iomap for dio. PS: Please ignore if there are some silly mistakes. As I said, I wanted to get this out before today's discussion. :) Thanks for your help!! Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com> --- fs/ext2/ext2.h | 1 + fs/ext2/file.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext2/inode.c | 20 +-------- 3 files changed, 117 insertions(+), 18 deletions(-) -- 2.39.2