Message ID | 1530846750-6686-5-git-send-email-shan.hai@oracle.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Fri, Jul 06, 2018 at 11:12:25AM +0800, Shan Hai wrote: > Implement read/write functions for inline data access and use them > for buffered/DIO/DAX operations. > > Signed-off-by: Shan Hai <shan.hai@oracle.com> > --- > fs/xfs/xfs_file.c | 244 +++++++++++++++++++++++++++++++++++++++++++++++++++++- > 1 file changed, 242 insertions(+), 2 deletions(-) > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index a3e7767a5715..55047928b720 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -27,6 +27,7 @@ > #include "xfs_pnfs.h" > #include "xfs_iomap.h" > #include "xfs_reflink.h" > +#include "xfs_defer.h" > > #include <linux/dcache.h> > #include <linux/falloc.h> > @@ -175,6 +176,185 @@ xfs_file_fsync( > return error; > } > > +/* > + * Read the inline data of a local format file. > + */ > +STATIC ssize_t > +xfs_file_inline_data_read( > + struct xfs_inode *ip, > + int whichfork, > + struct kiocb *iocb, > + struct iov_iter *to) > +{ > + xfs_ifork_t *ifp = XFS_IFORK_PTR(ip, whichfork); > + struct inode *inode = VFS_I(ip); > + loff_t isize = i_size_read(inode); > + loff_t *ppos = &iocb->ki_pos; > + size_t count = iov_iter_count(to); > + ssize_t ret = 0; > + > + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || > + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); > + > + if (!count) > + return 0; > + > + if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) > + return 0; > + > + if (*ppos > isize) > + return 0; > + > + iov_iter_truncate(to, inode->i_sb->s_maxbytes); > + count = (*ppos + count > isize) ? isize - *ppos : count; > + ret = copy_to_iter(ifp->if_u1.if_data + *ppos, count, to); > + *ppos += ret; > + > + return ret; > +} > + > +/* > + * Try to inline the data into the xfs inode data fork when it has enough > + * space to hold it. The data inlining happens at below points: > + * > + * - writeback: for buffered writes > + * - write_iter: for dax/dio writes > + * > + * The extents to local format conversion is done here for DAX/DIO write, > + * while the same conversion is done in the writeback path for buffered write. > + * This function also converts inode from local to extents when the data > + * fork could not hold the data inline anymore. > + */ > +STATIC ssize_t > +xfs_file_inline_data_write( > + struct xfs_inode *ip, > + int whichfork, > + struct kiocb *iocb, > + struct iov_iter *from) > +{ > + struct xfs_mount *mp = ip->i_mount; > + xfs_ifork_t *ifp = XFS_IFORK_PTR(ip, whichfork); > + struct inode *inode = VFS_I(ip); > + struct address_space *mapping = inode->i_mapping; > + loff_t pos = iocb->ki_pos; > + size_t count = iov_iter_count(from); > + struct xfs_trans *tp; > + struct xfs_defer_ops dfops; > + xfs_fsblock_t first_block; > + int ret; > + int error = 0; > + int logflags = 0; > + loff_t written = 0; > + xfs_fileoff_t bno = 0; > + struct xfs_bmbt_irec map; > + int nmap; > + struct page *page; > + char *kaddr; > + > + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || > + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); > + > + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 1, 0, 0, &tp); > + if (error) { > + ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); > + return error; > + } > + > + xfs_ilock(ip, XFS_ILOCK_EXCL); > + xfs_trans_ijoin(tp, ip, 0); > + xfs_defer_init(&dfops, &first_block); > + > + if (pos + count > XFS_IFORK_DSIZE(ip)) { > + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS) { > + error = 0; > + goto trans_cancel; > + } > + > + page = find_or_create_page(mapping, pos >> PAGE_SHIFT, GFP_NOFS); > + if (!page) { > + error = -ENOMEM; > + goto trans_cancel; > + } > + kaddr = kmap_atomic(page); > + memcpy(kaddr, ifp->if_u1.if_data, ifp->if_bytes); > + kunmap_atomic(kaddr); > + SetPageUptodate(page); > + unlock_page(page); > + put_page(page); > + > + xfs_bmap_forkoff_reset(ip, whichfork); > + ifp->if_flags &= ~XFS_IFINLINE; > + ifp->if_flags |= XFS_IFEXTENTS; > + ifp->if_u1.if_root = NULL; > + ifp->if_height = 0; > + ifp->if_bytes = 0; > + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); > + > + written = ifp->if_bytes; > + error = xfs_bmap_first_unused(tp, ip, 1, &bno, whichfork); > + if (error) > + goto trans_cancel; > + nmap = 1; > + error = xfs_bmapi_write(tp, ip, bno, 1, XFS_BMAPI_ENTIRE, > + &first_block, 1, &map, &nmap, &dfops); > + if (error) > + goto trans_cancel; > + > + goto out_local_to_extents; > + } > + > + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { > + xfs_idata_realloc(ip, (pos + count) - ifp->if_bytes, whichfork); > + ret = copy_from_iter(ifp->if_u1.if_data + pos, count, from); > + written = ret; > + goto out; > + } > + > + if (IS_DAX(inode) || iocb->ki_flags & IOCB_DIRECT) { > + struct page *page = alloc_page(GFP_KERNEL | GFP_NOFS); > + if (!page) { > + error = -ENOMEM; > + goto trans_cancel; > + } > + > + kaddr = kmap_atomic(page); > + ret = copy_from_iter(kaddr + pos, count, from); > + kunmap_atomic(kaddr); > + written = ret; > + error = xfs_bmap_extents_to_local(tp, ip, &dfops, &logflags, > + XFS_DATA_FORK, page); > + if (error) { > + __free_page(page); > + goto trans_cancel; > + } > + > + __free_page(page); > + goto out; > + } > +trans_cancel: > + xfs_defer_cancel(&dfops); > + xfs_trans_cancel(tp); > + xfs_iunlock(ip, XFS_ILOCK_EXCL); > + return error; > +out: > + if (pos + written > i_size_read(inode)) > + i_size_write(inode, pos + written); > + ip->i_d.di_size = pos + written; > + > +out_local_to_extents: > + logflags |= (XFS_ILOG_DDATA | XFS_ILOG_CORE); > + xfs_trans_log_inode(tp, ip, logflags); > + > + error = xfs_defer_finish(&tp, &dfops); > + if (error) > + goto trans_cancel; > + error = xfs_trans_commit(tp); > + xfs_iunlock(ip, XFS_ILOCK_EXCL); > + > + return written > 0 ? written : error; > +} > + > + > STATIC ssize_t > xfs_file_dio_aio_read( > struct kiocb *iocb, > @@ -192,7 +372,12 @@ xfs_file_dio_aio_read( > file_accessed(iocb->ki_filp); > > xfs_ilock(ip, XFS_IOLOCK_SHARED); > - ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL); > + if (xfs_sb_version_hasinlinedata(&ip->i_mount->m_sb) && > + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_LOCAL) { > + ret = xfs_file_inline_data_read(ip, XFS_DATA_FORK, iocb, to); I thought iomap was growing the ability to handle inline data? At least for gfs2? Hmm, you might ask Christoph about how to support that... also these read/write paths are about to get a lot of rip/smash this cycle. > + } else { > + ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL); > + } > xfs_iunlock(ip, XFS_IOLOCK_SHARED); > > return ret; > @@ -219,6 +404,13 @@ xfs_file_dax_read( > xfs_ilock(ip, XFS_IOLOCK_SHARED); > } > > + if (xfs_sb_version_hasinlinedata(&ip->i_mount->m_sb) && > + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_LOCAL) { > + ret = xfs_file_inline_data_read(ip, XFS_DATA_FORK, iocb, to); > + } else { > + ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); > + } > + > ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); > xfs_iunlock(ip, XFS_IOLOCK_SHARED); > > @@ -242,7 +434,12 @@ xfs_file_buffered_aio_read( > } else { > xfs_ilock(ip, XFS_IOLOCK_SHARED); > } > - ret = generic_file_read_iter(iocb, to); > + > + if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_LOCAL) { > + ret = xfs_file_inline_data_read(ip, XFS_DATA_FORK, iocb, to); > + } else { > + ret = generic_file_read_iter(iocb, to); > + } > xfs_iunlock(ip, XFS_IOLOCK_SHARED); > > return ret; > @@ -487,6 +684,7 @@ xfs_file_dio_aio_write( > size_t count = iov_iter_count(from); > struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? > mp->m_rtdev_targp : mp->m_ddev_targp; > + int error; > > /* DIO must be aligned to device logical sector size */ > if ((iocb->ki_pos | count) & target->bt_logical_sectormask) > @@ -547,6 +745,22 @@ xfs_file_dio_aio_write( > } > > trace_xfs_file_direct_write(ip, count, iocb->ki_pos); > + > + if (xfs_sb_version_hasinlinedata(&mp->m_sb) && > + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE && != BTREE? Why not == INLINE? > + S_ISREG(inode->i_mode)) { > + error = xfs_file_inline_data_write(ip, XFS_DATA_FORK, iocb, > + from); int != ssize_t... also, if we decided we needed extents format but the file turned out to be extents format already then did we lose the write here? > + if (error) { > + ret = error; > + goto out; > + } > + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { > + ret = count; > + goto out; > + } > + } > + > ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); > out: > xfs_iunlock(ip, iolock); > @@ -586,6 +800,20 @@ xfs_file_dax_write( > count = iov_iter_count(from); > > trace_xfs_file_dax_write(ip, count, pos); > + > + if (xfs_sb_version_hasinlinedata(&ip->i_mount->m_sb) && > + ip->i_d.di_format != XFS_DINODE_FMT_BTREE && > + S_ISREG(inode->i_mode)) { > + error = xfs_file_inline_data_write(ip, XFS_DATA_FORK, > + iocb, from); > + if (error) > + goto out; > + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { > + ret = count; > + goto out; > + } > + } > + > ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops); > if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { > i_size_write(inode, iocb->ki_pos); > @@ -614,6 +842,7 @@ xfs_file_buffered_aio_write( > struct address_space *mapping = file->f_mapping; > struct inode *inode = mapping->host; > struct xfs_inode *ip = XFS_I(inode); > + struct xfs_mount *mp = ip->i_mount; > ssize_t ret; > int enospc = 0; > int iolock; > @@ -629,6 +858,17 @@ xfs_file_buffered_aio_write( > if (ret) > goto out; > > + if (xfs_sb_version_hasinlinedata(&mp->m_sb) && > + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE) { > + ret = xfs_file_inline_data_write(ip, XFS_DATA_FORK, iocb, from); > + if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == > + XFS_DINODE_FMT_LOCAL){ What's going on with the indenting here? --D > + if (likely(ret >= 0)) > + iocb->ki_pos += ret; > + goto out; > + } > + } > + > /* We can write back this queue in page reclaim */ > current->backing_dev_info = inode_to_bdi(inode); > > -- > 2.11.0 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2018年07月06日 11:33, Darrick J. Wong wrote: > On Fri, Jul 06, 2018 at 11:12:25AM +0800, Shan Hai wrote: >> Implement read/write functions for inline data access and use them >> for buffered/DIO/DAX operations. >> >> Signed-off-by: Shan Hai <shan.hai@oracle.com> >> --- >> fs/xfs/xfs_file.c | 244 +++++++++++++++++++++++++++++++++++++++++++++++++++++- >> 1 file changed, 242 insertions(+), 2 deletions(-) >> >> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c >> index a3e7767a5715..55047928b720 100644 >> --- a/fs/xfs/xfs_file.c >> +++ b/fs/xfs/xfs_file.c >> @@ -27,6 +27,7 @@ >> #include "xfs_pnfs.h" >> #include "xfs_iomap.h" >> #include "xfs_reflink.h" >> +#include "xfs_defer.h" >> >> #include <linux/dcache.h> >> #include <linux/falloc.h> >> @@ -175,6 +176,185 @@ xfs_file_fsync( >> return error; >> } >> >> +/* >> + * Read the inline data of a local format file. >> + */ >> +STATIC ssize_t >> +xfs_file_inline_data_read( >> + struct xfs_inode *ip, >> + int whichfork, >> + struct kiocb *iocb, >> + struct iov_iter *to) >> +{ >> + xfs_ifork_t *ifp = XFS_IFORK_PTR(ip, whichfork); >> + struct inode *inode = VFS_I(ip); >> + loff_t isize = i_size_read(inode); >> + loff_t *ppos = &iocb->ki_pos; >> + size_t count = iov_iter_count(to); >> + ssize_t ret = 0; >> + >> + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || >> + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); >> + >> + if (!count) >> + return 0; >> + >> + if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) >> + return 0; >> + >> + if (*ppos > isize) >> + return 0; >> + >> + iov_iter_truncate(to, inode->i_sb->s_maxbytes); >> + count = (*ppos + count > isize) ? isize - *ppos : count; >> + ret = copy_to_iter(ifp->if_u1.if_data + *ppos, count, to); >> + *ppos += ret; >> + >> + return ret; >> +} >> + >> +/* >> + * Try to inline the data into the xfs inode data fork when it has enough >> + * space to hold it. The data inlining happens at below points: >> + * >> + * - writeback: for buffered writes >> + * - write_iter: for dax/dio writes >> + * >> + * The extents to local format conversion is done here for DAX/DIO write, >> + * while the same conversion is done in the writeback path for buffered write. >> + * This function also converts inode from local to extents when the data >> + * fork could not hold the data inline anymore. >> + */ >> +STATIC ssize_t >> +xfs_file_inline_data_write( >> + struct xfs_inode *ip, >> + int whichfork, >> + struct kiocb *iocb, >> + struct iov_iter *from) >> +{ >> + struct xfs_mount *mp = ip->i_mount; >> + xfs_ifork_t *ifp = XFS_IFORK_PTR(ip, whichfork); >> + struct inode *inode = VFS_I(ip); >> + struct address_space *mapping = inode->i_mapping; >> + loff_t pos = iocb->ki_pos; >> + size_t count = iov_iter_count(from); >> + struct xfs_trans *tp; >> + struct xfs_defer_ops dfops; >> + xfs_fsblock_t first_block; >> + int ret; >> + int error = 0; >> + int logflags = 0; >> + loff_t written = 0; >> + xfs_fileoff_t bno = 0; >> + struct xfs_bmbt_irec map; >> + int nmap; >> + struct page *page; >> + char *kaddr; >> + >> + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || >> + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); >> + >> + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 1, 0, 0, &tp); >> + if (error) { >> + ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); >> + return error; >> + } >> + >> + xfs_ilock(ip, XFS_ILOCK_EXCL); >> + xfs_trans_ijoin(tp, ip, 0); >> + xfs_defer_init(&dfops, &first_block); >> + >> + if (pos + count > XFS_IFORK_DSIZE(ip)) { >> + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS) { >> + error = 0; >> + goto trans_cancel; >> + } >> + >> + page = find_or_create_page(mapping, pos >> PAGE_SHIFT, GFP_NOFS); >> + if (!page) { >> + error = -ENOMEM; >> + goto trans_cancel; >> + } >> + kaddr = kmap_atomic(page); >> + memcpy(kaddr, ifp->if_u1.if_data, ifp->if_bytes); >> + kunmap_atomic(kaddr); >> + SetPageUptodate(page); >> + unlock_page(page); >> + put_page(page); >> + >> + xfs_bmap_forkoff_reset(ip, whichfork); >> + ifp->if_flags &= ~XFS_IFINLINE; >> + ifp->if_flags |= XFS_IFEXTENTS; >> + ifp->if_u1.if_root = NULL; >> + ifp->if_height = 0; >> + ifp->if_bytes = 0; >> + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); >> + >> + written = ifp->if_bytes; >> + error = xfs_bmap_first_unused(tp, ip, 1, &bno, whichfork); >> + if (error) >> + goto trans_cancel; >> + nmap = 1; >> + error = xfs_bmapi_write(tp, ip, bno, 1, XFS_BMAPI_ENTIRE, >> + &first_block, 1, &map, &nmap, &dfops); >> + if (error) >> + goto trans_cancel; >> + >> + goto out_local_to_extents; >> + } >> + >> + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { >> + xfs_idata_realloc(ip, (pos + count) - ifp->if_bytes, whichfork); >> + ret = copy_from_iter(ifp->if_u1.if_data + pos, count, from); >> + written = ret; >> + goto out; >> + } >> + >> + if (IS_DAX(inode) || iocb->ki_flags & IOCB_DIRECT) { >> + struct page *page = alloc_page(GFP_KERNEL | GFP_NOFS); >> + if (!page) { >> + error = -ENOMEM; >> + goto trans_cancel; >> + } >> + >> + kaddr = kmap_atomic(page); >> + ret = copy_from_iter(kaddr + pos, count, from); >> + kunmap_atomic(kaddr); >> + written = ret; >> + error = xfs_bmap_extents_to_local(tp, ip, &dfops, &logflags, >> + XFS_DATA_FORK, page); >> + if (error) { >> + __free_page(page); >> + goto trans_cancel; >> + } >> + >> + __free_page(page); >> + goto out; >> + } >> +trans_cancel: >> + xfs_defer_cancel(&dfops); >> + xfs_trans_cancel(tp); >> + xfs_iunlock(ip, XFS_ILOCK_EXCL); >> + return error; >> +out: >> + if (pos + written > i_size_read(inode)) >> + i_size_write(inode, pos + written); >> + ip->i_d.di_size = pos + written; >> + >> +out_local_to_extents: >> + logflags |= (XFS_ILOG_DDATA | XFS_ILOG_CORE); >> + xfs_trans_log_inode(tp, ip, logflags); >> + >> + error = xfs_defer_finish(&tp, &dfops); >> + if (error) >> + goto trans_cancel; >> + error = xfs_trans_commit(tp); >> + xfs_iunlock(ip, XFS_ILOCK_EXCL); >> + >> + return written > 0 ? written : error; >> +} >> + >> + >> STATIC ssize_t >> xfs_file_dio_aio_read( >> struct kiocb *iocb, >> @@ -192,7 +372,12 @@ xfs_file_dio_aio_read( >> file_accessed(iocb->ki_filp); >> >> xfs_ilock(ip, XFS_IOLOCK_SHARED); >> - ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL); >> + if (xfs_sb_version_hasinlinedata(&ip->i_mount->m_sb) && >> + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_LOCAL) { >> + ret = xfs_file_inline_data_read(ip, XFS_DATA_FORK, iocb, to); > I thought iomap was growing the ability to handle inline data? At least > for gfs2? > > Hmm, you might ask Christoph about how to support that... also these > read/write paths are about to get a lot of rip/smash this cycle. > >> + } else { >> + ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL); >> + } >> xfs_iunlock(ip, XFS_IOLOCK_SHARED); >> >> return ret; >> @@ -219,6 +404,13 @@ xfs_file_dax_read( >> xfs_ilock(ip, XFS_IOLOCK_SHARED); >> } >> >> + if (xfs_sb_version_hasinlinedata(&ip->i_mount->m_sb) && >> + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_LOCAL) { >> + ret = xfs_file_inline_data_read(ip, XFS_DATA_FORK, iocb, to); >> + } else { >> + ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); >> + } >> + >> ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); >> xfs_iunlock(ip, XFS_IOLOCK_SHARED); >> >> @@ -242,7 +434,12 @@ xfs_file_buffered_aio_read( >> } else { >> xfs_ilock(ip, XFS_IOLOCK_SHARED); >> } >> - ret = generic_file_read_iter(iocb, to); >> + >> + if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_LOCAL) { >> + ret = xfs_file_inline_data_read(ip, XFS_DATA_FORK, iocb, to); >> + } else { >> + ret = generic_file_read_iter(iocb, to); >> + } >> xfs_iunlock(ip, XFS_IOLOCK_SHARED); >> >> return ret; >> @@ -487,6 +684,7 @@ xfs_file_dio_aio_write( >> size_t count = iov_iter_count(from); >> struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? >> mp->m_rtdev_targp : mp->m_ddev_targp; >> + int error; >> >> /* DIO must be aligned to device logical sector size */ >> if ((iocb->ki_pos | count) & target->bt_logical_sectormask) >> @@ -547,6 +745,22 @@ xfs_file_dio_aio_write( >> } >> >> trace_xfs_file_direct_write(ip, count, iocb->ki_pos); >> + >> + if (xfs_sb_version_hasinlinedata(&mp->m_sb) && >> + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE && > != BTREE? Why not == INLINE? The xfs inode is born with EXTENTS format and the very first write to a file will encounter the EXTENTS since it's not converted to INLINE yet, the format conversion happens in the following xfs_file_inline_data_write. >> + S_ISREG(inode->i_mode)) { >> + error = xfs_file_inline_data_write(ip, XFS_DATA_FORK, iocb, >> + from); > int != ssize_t... also, if we decided we needed extents format but the > file turned out to be extents format already then did we lose the write > here? If the file is already in the extents format then the above xfs_file_inline_data_write just returns without format conversion and the control flow falls through to the normal extents handling because of the following if statement found that the format is not LOCAL. >> + if (error) { >> + ret = error; >> + goto out; >> + } >> + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { >> + ret = count; >> + goto out; >> + } >> + } >> + >> ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); >> out: >> xfs_iunlock(ip, iolock); >> @@ -586,6 +800,20 @@ xfs_file_dax_write( >> count = iov_iter_count(from); >> >> trace_xfs_file_dax_write(ip, count, pos); >> + >> + if (xfs_sb_version_hasinlinedata(&ip->i_mount->m_sb) && >> + ip->i_d.di_format != XFS_DINODE_FMT_BTREE && >> + S_ISREG(inode->i_mode)) { >> + error = xfs_file_inline_data_write(ip, XFS_DATA_FORK, >> + iocb, from); >> + if (error) >> + goto out; >> + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { >> + ret = count; >> + goto out; >> + } >> + } >> + >> ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops); >> if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { >> i_size_write(inode, iocb->ki_pos); >> @@ -614,6 +842,7 @@ xfs_file_buffered_aio_write( >> struct address_space *mapping = file->f_mapping; >> struct inode *inode = mapping->host; >> struct xfs_inode *ip = XFS_I(inode); >> + struct xfs_mount *mp = ip->i_mount; >> ssize_t ret; >> int enospc = 0; >> int iolock; >> @@ -629,6 +858,17 @@ xfs_file_buffered_aio_write( >> if (ret) >> goto out; >> >> + if (xfs_sb_version_hasinlinedata(&mp->m_sb) && >> + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE) { >> + ret = xfs_file_inline_data_write(ip, XFS_DATA_FORK, iocb, from); >> + if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == >> + XFS_DINODE_FMT_LOCAL){ > What's going on with the indenting here? Oops, sorry, I will fix it. Thanks Shan Hai > > --D > >> + if (likely(ret >= 0)) >> + iocb->ki_pos += ret; >> + goto out; >> + } >> + } >> + >> /* We can write back this queue in page reclaim */ >> current->backing_dev_info = inode_to_bdi(inode); >> >> -- >> 2.11.0 >> >> -- >> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in >> the body of a message to majordomo@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Fri, Jul 06, 2018 at 11:12:25AM +0800, Shan Hai wrote: > Implement read/write functions for inline data access and use them > for buffered/DIO/DAX operations. Please use the iomap inline handling in this branch instead of implementing it in XFS: https://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git/log/?h=iomap-4.19-merge It seems like your code doesn't use the page cache for inline data at the moment, which means you'd want to always use the direct I/O version of the code as-is. Althought that makes me wonder how you handle memory mapped I/O, but I guess I'll find out later. -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 2018年07月08日 23:45, Christoph Hellwig wrote: > On Fri, Jul 06, 2018 at 11:12:25AM +0800, Shan Hai wrote: >> Implement read/write functions for inline data access and use them >> for buffered/DIO/DAX operations. > Please use the iomap inline handling in this branch instead of > implementing it in XFS: > > https://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git/log/?h=iomap-4.19-merge OK, I will base the patches on the iomap inline handling code. > It seems like your code doesn't use the page cache for inline data > at the moment, which means you'd want to always use the direct I/O > version of the code as-is. Yes, the read path uses direct I/O version while the write is not. > Althought that makes me wonder how you handle memory mapped I/O, but > I guess I'll find out later. I will use page cache to handle the mmap case, thanks for the review. Thanks Shan Hai -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index a3e7767a5715..55047928b720 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -27,6 +27,7 @@ #include "xfs_pnfs.h" #include "xfs_iomap.h" #include "xfs_reflink.h" +#include "xfs_defer.h" #include <linux/dcache.h> #include <linux/falloc.h> @@ -175,6 +176,185 @@ xfs_file_fsync( return error; } +/* + * Read the inline data of a local format file. + */ +STATIC ssize_t +xfs_file_inline_data_read( + struct xfs_inode *ip, + int whichfork, + struct kiocb *iocb, + struct iov_iter *to) +{ + xfs_ifork_t *ifp = XFS_IFORK_PTR(ip, whichfork); + struct inode *inode = VFS_I(ip); + loff_t isize = i_size_read(inode); + loff_t *ppos = &iocb->ki_pos; + size_t count = iov_iter_count(to); + ssize_t ret = 0; + + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + + if (!count) + return 0; + + if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) + return 0; + + if (*ppos > isize) + return 0; + + iov_iter_truncate(to, inode->i_sb->s_maxbytes); + count = (*ppos + count > isize) ? isize - *ppos : count; + ret = copy_to_iter(ifp->if_u1.if_data + *ppos, count, to); + *ppos += ret; + + return ret; +} + +/* + * Try to inline the data into the xfs inode data fork when it has enough + * space to hold it. The data inlining happens at below points: + * + * - writeback: for buffered writes + * - write_iter: for dax/dio writes + * + * The extents to local format conversion is done here for DAX/DIO write, + * while the same conversion is done in the writeback path for buffered write. + * This function also converts inode from local to extents when the data + * fork could not hold the data inline anymore. + */ +STATIC ssize_t +xfs_file_inline_data_write( + struct xfs_inode *ip, + int whichfork, + struct kiocb *iocb, + struct iov_iter *from) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_ifork_t *ifp = XFS_IFORK_PTR(ip, whichfork); + struct inode *inode = VFS_I(ip); + struct address_space *mapping = inode->i_mapping; + loff_t pos = iocb->ki_pos; + size_t count = iov_iter_count(from); + struct xfs_trans *tp; + struct xfs_defer_ops dfops; + xfs_fsblock_t first_block; + int ret; + int error = 0; + int logflags = 0; + loff_t written = 0; + xfs_fileoff_t bno = 0; + struct xfs_bmbt_irec map; + int nmap; + struct page *page; + char *kaddr; + + ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS || + XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 1, 0, 0, &tp); + if (error) { + ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + xfs_defer_init(&dfops, &first_block); + + if (pos + count > XFS_IFORK_DSIZE(ip)) { + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS) { + error = 0; + goto trans_cancel; + } + + page = find_or_create_page(mapping, pos >> PAGE_SHIFT, GFP_NOFS); + if (!page) { + error = -ENOMEM; + goto trans_cancel; + } + kaddr = kmap_atomic(page); + memcpy(kaddr, ifp->if_u1.if_data, ifp->if_bytes); + kunmap_atomic(kaddr); + SetPageUptodate(page); + unlock_page(page); + put_page(page); + + xfs_bmap_forkoff_reset(ip, whichfork); + ifp->if_flags &= ~XFS_IFINLINE; + ifp->if_flags |= XFS_IFEXTENTS; + ifp->if_u1.if_root = NULL; + ifp->if_height = 0; + ifp->if_bytes = 0; + XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + + written = ifp->if_bytes; + error = xfs_bmap_first_unused(tp, ip, 1, &bno, whichfork); + if (error) + goto trans_cancel; + nmap = 1; + error = xfs_bmapi_write(tp, ip, bno, 1, XFS_BMAPI_ENTIRE, + &first_block, 1, &map, &nmap, &dfops); + if (error) + goto trans_cancel; + + goto out_local_to_extents; + } + + if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + xfs_idata_realloc(ip, (pos + count) - ifp->if_bytes, whichfork); + ret = copy_from_iter(ifp->if_u1.if_data + pos, count, from); + written = ret; + goto out; + } + + if (IS_DAX(inode) || iocb->ki_flags & IOCB_DIRECT) { + struct page *page = alloc_page(GFP_KERNEL | GFP_NOFS); + if (!page) { + error = -ENOMEM; + goto trans_cancel; + } + + kaddr = kmap_atomic(page); + ret = copy_from_iter(kaddr + pos, count, from); + kunmap_atomic(kaddr); + written = ret; + error = xfs_bmap_extents_to_local(tp, ip, &dfops, &logflags, + XFS_DATA_FORK, page); + if (error) { + __free_page(page); + goto trans_cancel; + } + + __free_page(page); + goto out; + } +trans_cancel: + xfs_defer_cancel(&dfops); + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +out: + if (pos + written > i_size_read(inode)) + i_size_write(inode, pos + written); + ip->i_d.di_size = pos + written; + +out_local_to_extents: + logflags |= (XFS_ILOG_DDATA | XFS_ILOG_CORE); + xfs_trans_log_inode(tp, ip, logflags); + + error = xfs_defer_finish(&tp, &dfops); + if (error) + goto trans_cancel; + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + return written > 0 ? written : error; +} + + STATIC ssize_t xfs_file_dio_aio_read( struct kiocb *iocb, @@ -192,7 +372,12 @@ xfs_file_dio_aio_read( file_accessed(iocb->ki_filp); xfs_ilock(ip, XFS_IOLOCK_SHARED); - ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL); + if (xfs_sb_version_hasinlinedata(&ip->i_mount->m_sb) && + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_LOCAL) { + ret = xfs_file_inline_data_read(ip, XFS_DATA_FORK, iocb, to); + } else { + ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL); + } xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; @@ -219,6 +404,13 @@ xfs_file_dax_read( xfs_ilock(ip, XFS_IOLOCK_SHARED); } + if (xfs_sb_version_hasinlinedata(&ip->i_mount->m_sb) && + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_LOCAL) { + ret = xfs_file_inline_data_read(ip, XFS_DATA_FORK, iocb, to); + } else { + ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); + } + ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -242,7 +434,12 @@ xfs_file_buffered_aio_read( } else { xfs_ilock(ip, XFS_IOLOCK_SHARED); } - ret = generic_file_read_iter(iocb, to); + + if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == XFS_DINODE_FMT_LOCAL) { + ret = xfs_file_inline_data_read(ip, XFS_DATA_FORK, iocb, to); + } else { + ret = generic_file_read_iter(iocb, to); + } xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; @@ -487,6 +684,7 @@ xfs_file_dio_aio_write( size_t count = iov_iter_count(from); struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; + int error; /* DIO must be aligned to device logical sector size */ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) @@ -547,6 +745,22 @@ xfs_file_dio_aio_write( } trace_xfs_file_direct_write(ip, count, iocb->ki_pos); + + if (xfs_sb_version_hasinlinedata(&mp->m_sb) && + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE && + S_ISREG(inode->i_mode)) { + error = xfs_file_inline_data_write(ip, XFS_DATA_FORK, iocb, + from); + if (error) { + ret = error; + goto out; + } + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + ret = count; + goto out; + } + } + ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); out: xfs_iunlock(ip, iolock); @@ -586,6 +800,20 @@ xfs_file_dax_write( count = iov_iter_count(from); trace_xfs_file_dax_write(ip, count, pos); + + if (xfs_sb_version_hasinlinedata(&ip->i_mount->m_sb) && + ip->i_d.di_format != XFS_DINODE_FMT_BTREE && + S_ISREG(inode->i_mode)) { + error = xfs_file_inline_data_write(ip, XFS_DATA_FORK, + iocb, from); + if (error) + goto out; + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { + ret = count; + goto out; + } + } + ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); @@ -614,6 +842,7 @@ xfs_file_buffered_aio_write( struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; ssize_t ret; int enospc = 0; int iolock; @@ -629,6 +858,17 @@ xfs_file_buffered_aio_write( if (ret) goto out; + if (xfs_sb_version_hasinlinedata(&mp->m_sb) && + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE) { + ret = xfs_file_inline_data_write(ip, XFS_DATA_FORK, iocb, from); + if (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) == + XFS_DINODE_FMT_LOCAL){ + if (likely(ret >= 0)) + iocb->ki_pos += ret; + goto out; + } + } + /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode);
Implement read/write functions for inline data access and use them for buffered/DIO/DAX operations. Signed-off-by: Shan Hai <shan.hai@oracle.com> --- fs/xfs/xfs_file.c | 244 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 242 insertions(+), 2 deletions(-)