Message ID | 1478603297-11793-4-git-send-email-jack@suse.cz (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Tue, Nov 08, 2016 at 12:08:09PM +0100, Jan Kara wrote: > Implement basic iomap_begin function that handles reading and use it for > DAX reads. > > Signed-off-by: Jan Kara <jack@suse.cz> > --- > fs/ext4/ext4.h | 2 ++ > fs/ext4/file.c | 38 +++++++++++++++++++++++++++++++++++++- > fs/ext4/inode.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 93 insertions(+), 1 deletion(-) > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index 282a51b07c57..098b39910001 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -3271,6 +3271,8 @@ static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len) > return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize); > } > > +extern struct iomap_ops ext4_iomap_ops; > + > #endif /* __KERNEL__ */ > > #define EFSBADCRC EBADMSG /* Bad CRC detected */ > diff --git a/fs/ext4/file.c b/fs/ext4/file.c > index 9facb4dc5c70..1f25c644cb12 100644 > --- a/fs/ext4/file.c > +++ b/fs/ext4/file.c > @@ -31,6 +31,42 @@ > #include "xattr.h" > #include "acl.h" > > +#ifdef CONFIG_FS_DAX > +static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) > +{ > + struct inode *inode = file_inode(iocb->ki_filp); > + ssize_t ret; > + > + inode_lock_shared(inode); > + /* > + * Recheck under inode lock - at this point we are sure it cannot > + * change anymore > + */ > + if (!IS_DAX(inode)) { > + inode_unlock_shared(inode); > + /* Fallback to buffered IO in case we cannot support DAX */ > + return generic_file_read_iter(iocb, to); Is this not also racy, since we've just dropped the inode lock? What's to prevent this sequence? (1) Thread 0 - ext4_file_read_iter(): IS_DAX() returns true; (2) Thread 1 - changes S_DAX to false; (3) Thread 0 - ext4_dax_read_iter(): inode_lock_shared(), IS_DAX() returns false, inode_unlock_shared(); (4) Thread 1 - changes S_DAX to true; (5) Thread 0 - generic_file_read_iter() on a DAX inode. Or are we okay in this scenario? 
-- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu 10-11-16 14:54:31, Ross Zwisler wrote: > On Tue, Nov 08, 2016 at 12:08:09PM +0100, Jan Kara wrote: > > Implement basic iomap_begin function that handles reading and use it for > > DAX reads. > > > > Signed-off-by: Jan Kara <jack@suse.cz> > > --- > > fs/ext4/ext4.h | 2 ++ > > fs/ext4/file.c | 38 +++++++++++++++++++++++++++++++++++++- > > fs/ext4/inode.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ > > 3 files changed, 93 insertions(+), 1 deletion(-) > > > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > > index 282a51b07c57..098b39910001 100644 > > --- a/fs/ext4/ext4.h > > +++ b/fs/ext4/ext4.h > > @@ -3271,6 +3271,8 @@ static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len) > > return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize); > > } > > > > +extern struct iomap_ops ext4_iomap_ops; > > + > > #endif /* __KERNEL__ */ > > > > #define EFSBADCRC EBADMSG /* Bad CRC detected */ > > diff --git a/fs/ext4/file.c b/fs/ext4/file.c > > index 9facb4dc5c70..1f25c644cb12 100644 > > --- a/fs/ext4/file.c > > +++ b/fs/ext4/file.c > > @@ -31,6 +31,42 @@ > > #include "xattr.h" > > #include "acl.h" > > > > +#ifdef CONFIG_FS_DAX > > +static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) > > +{ > > + struct inode *inode = file_inode(iocb->ki_filp); > > + ssize_t ret; > > + > > + inode_lock_shared(inode); > > + /* > > + * Recheck under inode lock - at this point we are sure it cannot > > + * change anymore > > + */ > > + if (!IS_DAX(inode)) { > > + inode_unlock_shared(inode); > > + /* Fallback to buffered IO in case we cannot support DAX */ > > + return generic_file_read_iter(iocb, to); > > Is this not also racy, since we've just dropped the inode lock? What's to > prevent this sequence? 
> > (1) Thread 0 - ext4_file_read_iter(): IS_DAX() returns true; > (2) Thread 1 - changes S_DAX to false; > (3) Thread 0 - ext4_dax_read_iter(): inode_lock_shared(), IS_DAX() returns false, inode_unlock_shared(); > (4) Thread 1 - changes S_DAX to true; > (5) Thread 0 - generic_file_read_iter() on a DAX inode. > > > Or are we okay in this scenario? Yup, I'm aware of this. The real problem is that there's no way to serialize with buffered reads for ext4 (they take only page locks) so currently you can have buffered reads in flight when inode gets switched to DAX mode. I agree there is a potential for breakage and it needs to be resolved eventually but the problem is not new and these patches don't make it really any worse so I just somewhat fixed it up by patch 2/11 and left full solution to a separate patch set. Honza
On Fri, Nov 11, 2016 at 11:17:51AM +0100, Jan Kara wrote: > On Thu 10-11-16 14:54:31, Ross Zwisler wrote: > > On Tue, Nov 08, 2016 at 12:08:09PM +0100, Jan Kara wrote: > > > Implement basic iomap_begin function that handles reading and use it for > > > DAX reads. > > > > > > Signed-off-by: Jan Kara <jack@suse.cz> > > > --- > > > fs/ext4/ext4.h | 2 ++ > > > fs/ext4/file.c | 38 +++++++++++++++++++++++++++++++++++++- > > > fs/ext4/inode.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ > > > 3 files changed, 93 insertions(+), 1 deletion(-) > > > > > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > > > index 282a51b07c57..098b39910001 100644 > > > --- a/fs/ext4/ext4.h > > > +++ b/fs/ext4/ext4.h > > > @@ -3271,6 +3271,8 @@ static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len) > > > return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize); > > > } > > > > > > +extern struct iomap_ops ext4_iomap_ops; > > > + > > > #endif /* __KERNEL__ */ > > > > > > #define EFSBADCRC EBADMSG /* Bad CRC detected */ > > > diff --git a/fs/ext4/file.c b/fs/ext4/file.c > > > index 9facb4dc5c70..1f25c644cb12 100644 > > > --- a/fs/ext4/file.c > > > +++ b/fs/ext4/file.c > > > @@ -31,6 +31,42 @@ > > > #include "xattr.h" > > > #include "acl.h" > > > > > > +#ifdef CONFIG_FS_DAX > > > +static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) > > > +{ > > > + struct inode *inode = file_inode(iocb->ki_filp); > > > + ssize_t ret; > > > + > > > + inode_lock_shared(inode); > > > + /* > > > + * Recheck under inode lock - at this point we are sure it cannot > > > + * change anymore > > > + */ > > > + if (!IS_DAX(inode)) { > > > + inode_unlock_shared(inode); > > > + /* Fallback to buffered IO in case we cannot support DAX */ > > > + return generic_file_read_iter(iocb, to); > > > > Is this not also racy, since we've just dropped the inode lock? What's to > > prevent this sequence? 
> > > > (1) Thread 0 - ext4_file_read_iter(): IS_DAX() returns true; > > (2) Thread 1 - changes S_DAX to false; > > (3) Thread 0 - ext4_dax_read_iter(): inode_lock_shared(), IS_DAX() returns false, inode_unlock_shared(); > > (4) Thread 1 - changes S_DAX to true; > > (5) Thread 0 - generic_file_read_iter() on a DAX inode. > > > > > > Or are we okay in this scenario? > > Yup, I'm aware of this. The real problem is that there's no way to > serialize with buffered reads for ext4 (they take only page locks) so > currently you can have buffered reads in flight when inode gets switched to > DAX mode. I agree there is a potential for breakage and it needs to be > resolved eventually but the problem is not new and these patches don't make > it really any worse so I just somewhat fixed it up by patch 2/11 and left > full solution to a separate patch set. Fair enough. You can add: Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com> -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 282a51b07c57..098b39910001 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3271,6 +3271,8 @@ static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len) return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize); } +extern struct iomap_ops ext4_iomap_ops; + #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 9facb4dc5c70..1f25c644cb12 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -31,6 +31,42 @@ #include "xattr.h" #include "acl.h" +#ifdef CONFIG_FS_DAX +static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + inode_lock_shared(inode); + /* + * Recheck under inode lock - at this point we are sure it cannot + * change anymore + */ + if (!IS_DAX(inode)) { + inode_unlock_shared(inode); + /* Fallback to buffered IO in case we cannot support DAX */ + return generic_file_read_iter(iocb, to); + } + ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops); + inode_unlock_shared(inode); + + file_accessed(iocb->ki_filp); + return ret; +} +#endif + +static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + if (!iov_iter_count(to)) + return 0; /* skip atime */ + +#ifdef CONFIG_FS_DAX + if (IS_DAX(file_inode(iocb->ki_filp))) + return ext4_dax_read_iter(iocb, to); +#endif + return generic_file_read_iter(iocb, to); +} + /* * Called when an inode is released. 
Note that this is different * from ext4_file_open: open gets called at every open, but release @@ -690,7 +726,7 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, - .read_iter = generic_file_read_iter, + .read_iter = ext4_file_read_iter, .write_iter = ext4_file_write_iter, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5337828c68a7..83e8411370d3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -37,6 +37,7 @@ #include <linux/printk.h> #include <linux/slab.h> #include <linux/bitops.h> +#include <linux/iomap.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -3310,6 +3311,59 @@ int ext4_dax_get_block(struct inode *inode, sector_t iblock, clear_buffer_new(bh_result); return 0; } + +static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned flags, struct iomap *iomap) +{ + unsigned int blkbits = inode->i_blkbits; + unsigned long first_block = offset >> blkbits; + unsigned long last_block = (offset + length - 1) >> blkbits; + struct ext4_map_blocks map; + int ret; + + if (flags & IOMAP_WRITE) + return -EIO; + + if (WARN_ON_ONCE(ext4_has_inline_data(inode))) + return -ERANGE; + + map.m_lblk = first_block; + map.m_len = last_block - first_block + 1; + + ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; + + iomap->flags = 0; + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = first_block << blkbits; + + if (ret == 0) { + iomap->type = IOMAP_HOLE; + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->length = (u64)map.m_len << blkbits; + } else { + if (map.m_flags & EXT4_MAP_MAPPED) { + iomap->type = IOMAP_MAPPED; + } else if (map.m_flags & EXT4_MAP_UNWRITTEN) { + iomap->type = IOMAP_UNWRITTEN; + } else { + WARN_ON_ONCE(1); + return -EIO; + } + iomap->blkno = (sector_t)map.m_pblk << (blkbits - 9); + iomap->length = (u64)map.m_len << blkbits; + } + + if (map.m_flags & 
EXT4_MAP_NEW) + iomap->flags |= IOMAP_F_NEW; + return 0; +} + +struct iomap_ops ext4_iomap_ops = { + .iomap_begin = ext4_iomap_begin, +}; + #else /* Just define empty function, it will never get called. */ int ext4_dax_get_block(struct inode *inode, sector_t iblock,
Implement basic iomap_begin function that handles reading and use it for DAX reads. Signed-off-by: Jan Kara <jack@suse.cz> --- fs/ext4/ext4.h | 2 ++ fs/ext4/file.c | 38 +++++++++++++++++++++++++++++++++++++- fs/ext4/inode.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 1 deletion(-)