Message ID | 20190621192828.28900-2-rgoldwyn@suse.de (mailing list archive) |
---|---|
State | New, archived |
Series | Btrfs iomap |
On Fri, Jun 21, 2019 at 02:28:23PM -0500, Goldwyn Rodrigues wrote: > From: Goldwyn Rodrigues <rgoldwyn@suse.com> > > Introduces a new type IOMAP_COW, which means the data at offset > must be read from a srcmap and copied before performing the > write on the offset. > > The srcmap is used to identify where the read is to be performed > from. This is passed to iomap->begin(), which is supposed to > put in the details for reading, typically set with type IOMAP_READ. What is IOMAP_READ ? > > Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com> > --- > fs/dax.c | 8 +++++--- > fs/ext2/inode.c | 2 +- > fs/ext4/inode.c | 2 +- > fs/gfs2/bmap.c | 3 ++- > fs/internal.h | 2 +- > fs/iomap.c | 31 ++++++++++++++++--------------- > fs/xfs/xfs_iomap.c | 9 ++++++--- > include/linux/iomap.h | 4 +++- > 8 files changed, 35 insertions(+), 26 deletions(-) > > diff --git a/fs/dax.c b/fs/dax.c > index 2e48c7ebb973..80b9e2599223 100644 > --- a/fs/dax.c > +++ b/fs/dax.c > @@ -1078,7 +1078,7 @@ EXPORT_SYMBOL_GPL(__dax_zero_page_range); > > static loff_t > dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, > - struct iomap *iomap) > + struct iomap *iomap, struct iomap *srcmap) > { > struct block_device *bdev = iomap->bdev; > struct dax_device *dax_dev = iomap->dax_dev; > @@ -1236,6 +1236,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, > unsigned long vaddr = vmf->address; > loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; > struct iomap iomap = { 0 }; > + struct iomap srcmap = { 0 }; > unsigned flags = IOMAP_FAULT; > int error, major = 0; > bool write = vmf->flags & FAULT_FLAG_WRITE; > @@ -1280,7 +1281,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, > * the file system block size to be equal the page size, which means > * that we never have to deal with more than a single extent here. > */ > - error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); > + error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap); > if (iomap_errp) > *iomap_errp = error; > if (error) { > @@ -1460,6 +1461,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, > struct inode *inode = mapping->host; > vm_fault_t result = VM_FAULT_FALLBACK; > struct iomap iomap = { 0 }; > + struct iomap srcmap = { 0 }; > pgoff_t max_pgoff; > void *entry; > loff_t pos; > @@ -1534,7 +1536,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, > * to look up our filesystem block. > */ > pos = (loff_t)xas.xa_index << PAGE_SHIFT; > - error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); > + error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap, &srcmap); Line too long? Also, I guess the DAX and directio write paths will just WARN_ON_ONCE if someone feeds them an IOMAP_COW type iomap? Ah, right, I guess the only filesystems that use iomap directio and iomap dax don't support COW. 
:) --D > if (error) > goto unlock_entry; > > diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c > index e474127dd255..f081f11980ad 100644 > --- a/fs/ext2/inode.c > +++ b/fs/ext2/inode.c > @@ -801,7 +801,7 @@ int ext2_get_block(struct inode *inode, sector_t iblock, > > #ifdef CONFIG_FS_DAX > static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, > - unsigned flags, struct iomap *iomap) > + unsigned flags, struct iomap *iomap, struct iomap *srcmap) > { > unsigned int blkbits = inode->i_blkbits; > unsigned long first_block = offset >> blkbits; > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > index c7f77c643008..a8017e0c302b 100644 > --- a/fs/ext4/inode.c > +++ b/fs/ext4/inode.c > @@ -3437,7 +3437,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) > } > > static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, > - unsigned flags, struct iomap *iomap) > + unsigned flags, struct iomap *iomap, struct iomap *srcmap) > { > struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); > unsigned int blkbits = inode->i_blkbits; > diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c > index 93ea1d529aa3..affa0c4305b7 100644 > --- a/fs/gfs2/bmap.c > +++ b/fs/gfs2/bmap.c > @@ -1124,7 +1124,8 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, > } > > static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, > - unsigned flags, struct iomap *iomap) > + unsigned flags, struct iomap *iomap, > + struct iomap *srcmap) > { > struct gfs2_inode *ip = GFS2_I(inode); > struct metapath mp = { .mp_aheight = 1, }; > diff --git a/fs/internal.h b/fs/internal.h > index a48ef81be37d..79e495d86165 100644 > --- a/fs/internal.h > +++ b/fs/internal.h > @@ -188,7 +188,7 @@ extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd, > * iomap support: > */ > typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, > - void *data, struct iomap *iomap); > + void *data, struct iomap *iomap, struct iomap *srcmap); > > loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, > unsigned flags, const struct iomap_ops *ops, void *data, > diff --git a/fs/iomap.c b/fs/iomap.c > index 23ef63fd1669..6648957af268 100644 > --- a/fs/iomap.c > +++ b/fs/iomap.c > @@ -41,6 +41,7 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, > const struct iomap_ops *ops, void *data, iomap_actor_t actor) > { > struct iomap iomap = { 0 }; > + struct iomap srcmap = { 0 }; > loff_t written = 0, ret; > > /* > @@ -55,7 +56,7 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, > * expose transient stale data. If the reserve fails, we can safely > * back out at this point as there is nothing to undo. > */ > - ret = ops->iomap_begin(inode, pos, length, flags, &iomap); > + ret = ops->iomap_begin(inode, pos, length, flags, &iomap, &srcmap); > if (ret) > return ret; > if (WARN_ON(iomap.offset > pos)) > @@ -75,7 +76,7 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, > * we can do the copy-in page by page without having to worry about > * failures exposing transient data. > */ > - written = actor(inode, pos, length, data, &iomap); > + written = actor(inode, pos, length, data, &iomap, &srcmap); > > /* > * Now the data has been copied, commit the range we've copied. 
This > @@ -282,7 +283,7 @@ iomap_read_inline_data(struct inode *inode, struct page *page, > > static loff_t > iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, > - struct iomap *iomap) > + struct iomap *iomap, struct iomap *srcmap) > { > struct iomap_readpage_ctx *ctx = data; > struct page *page = ctx->cur_page; > @@ -424,7 +425,7 @@ iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos, > > static loff_t > iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, > - void *data, struct iomap *iomap) > + void *data, struct iomap *iomap, struct iomap *srcmap) > { > struct iomap_readpage_ctx *ctx = data; > loff_t done, ret; > @@ -444,7 +445,7 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, > ctx->cur_page_in_bio = false; > } > ret = iomap_readpage_actor(inode, pos + done, length - done, > - ctx, iomap); > + ctx, iomap, srcmap); > } > > return done; > @@ -796,7 +797,7 @@ iomap_write_end(struct inode *inode, loff_t pos, unsigned len, > > static loff_t > iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, > - struct iomap *iomap) > + struct iomap *iomap, struct iomap *srcmap) > { > struct iov_iter *i = data; > long status = 0; > @@ -913,7 +914,7 @@ __iomap_read_page(struct inode *inode, loff_t offset) > > static loff_t > iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, > - struct iomap *iomap) > + struct iomap *iomap, struct iomap *srcmap) > { > long status = 0; > ssize_t written = 0; > @@ -1002,7 +1003,7 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, > > static loff_t > iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, > - void *data, struct iomap *iomap) > + void *data, struct iomap *iomap, struct iomap *srcmap) > { > bool *did_zero = data; > loff_t written = 0; > @@ -1071,7 +1072,7 @@ EXPORT_SYMBOL_GPL(iomap_truncate_page); > > static loff_t > iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, > - void *data, struct iomap *iomap) > + void *data, struct iomap *iomap, struct iomap *srcmap) > { > struct page *page = data; > int ret; > @@ -1169,7 +1170,7 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, > > static loff_t > iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, > - struct iomap *iomap) > + struct iomap *iomap, struct iomap *srcmap) > { > struct fiemap_ctx *ctx = data; > loff_t ret = length; > @@ -1343,7 +1344,7 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, > > static loff_t > iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length, > - void *data, struct iomap *iomap) > + void *data, struct iomap *iomap, struct iomap *srcmap) > { > switch (iomap->type) { > case IOMAP_UNWRITTEN: > @@ -1389,7 +1390,7 @@ EXPORT_SYMBOL_GPL(iomap_seek_hole); > > static loff_t > iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length, > - void *data, struct iomap *iomap) > + void *data, struct iomap *iomap, struct iomap *srcmap) > { > switch (iomap->type) { > case IOMAP_HOLE: > @@ -1790,7 +1791,7 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, > > static loff_t > iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, > - void *data, struct iomap *iomap) > + void *data, struct iomap *iomap, struct iomap *srcmap) > { > struct iomap_dio *dio = data; > > @@ -2057,7 +2058,7 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) > * distinction 
between written and unwritten extents. > */ > static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos, > - loff_t count, void *data, struct iomap *iomap) > + loff_t count, void *data, struct iomap *iomap, struct iomap *srcmap) > { > struct iomap_swapfile_info *isi = data; > int error; > @@ -2161,7 +2162,7 @@ EXPORT_SYMBOL_GPL(iomap_swapfile_activate); > > static loff_t > iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length, > - void *data, struct iomap *iomap) > + void *data, struct iomap *iomap, struct iomap *srcmap) > { > sector_t *bno = data, addr; > > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c > index 63d323916bba..6116a75f03da 100644 > --- a/fs/xfs/xfs_iomap.c > +++ b/fs/xfs/xfs_iomap.c > @@ -925,7 +925,8 @@ xfs_file_iomap_begin( > loff_t offset, > loff_t length, > unsigned flags, > - struct iomap *iomap) > + struct iomap *iomap, > + struct iomap *srcmap) > { > struct xfs_inode *ip = XFS_I(inode); > struct xfs_mount *mp = ip->i_mount; > @@ -1148,7 +1149,8 @@ xfs_seek_iomap_begin( > loff_t offset, > loff_t length, > unsigned flags, > - struct iomap *iomap) > + struct iomap *iomap, > + struct iomap *srcmap) > { > struct xfs_inode *ip = XFS_I(inode); > struct xfs_mount *mp = ip->i_mount; > @@ -1234,7 +1236,8 @@ xfs_xattr_iomap_begin( > loff_t offset, > loff_t length, > unsigned flags, > - struct iomap *iomap) > + struct iomap *iomap, > + struct iomap *srcmap) > { > struct xfs_inode *ip = XFS_I(inode); > struct xfs_mount *mp = ip->i_mount; > diff --git a/include/linux/iomap.h b/include/linux/iomap.h > index 2103b94cb1bf..f49767c7fd83 100644 > --- a/include/linux/iomap.h > +++ b/include/linux/iomap.h > @@ -25,6 +25,7 @@ struct vm_fault; > #define IOMAP_MAPPED 0x03 /* blocks allocated at @addr */ > #define IOMAP_UNWRITTEN 0x04 /* blocks allocated at @addr in unwritten state */ > #define IOMAP_INLINE 0x05 /* data inline in the inode */ > +#define IOMAP_COW 0x06 /* copy data from srcmap before writing */ > > /* > * Flags for all iomap mappings: > @@ -102,7 +103,8 @@ struct iomap_ops { > * The actual length is returned in iomap->length. > */ > int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length, > - unsigned flags, struct iomap *iomap); > + unsigned flags, struct iomap *iomap, > + struct iomap *srcmap); > > /* > * Commit and/or unreserve space previous allocated using iomap_begin. > -- > 2.16.4 >
xfs will need to be updated to fill in the additional iomap for the COW case. Has this series been tested on xfs?

I can't say I'm a huge fan of this two iomaps in one method call approach. I always thought two separate iomap iterations would be nicer, but compared to that even the older hack with just the additional src_addr seems a little better.
On 9:07 24/06, Christoph Hellwig wrote:
> xfs will need to be updated to fill in the additional iomap for the
> COW case. Has this series been tested on xfs?

No, I have not tested this, nor made xfs set IOMAP_COW. I will try to do
it in the next iteration.

> I can't say I'm a huge fan of this two iomaps in one method call
> approach. I always thought two separate iomap iterations would be nicer,
> but compared to that even the older hack with just the additional
> src_addr seems a little better.

I am just expanding on your idea of using multiple iterations for the COW
case in the hope we can come out with a good design:

1. iomap_file_buffered_write calls iomap_apply with the IOMAP_WRITE flag,
   which calls iomap_begin() for the respective filesystem.
2. btrfs_iomap_begin() sets up iomap->type as IOMAP_COW and fills the
   iomap struct with read address information.
3. iomap_apply() conditionally, for IOMAP_COW, calls do_cow() (a new
   function) and then calls ops->iomap_begin() with the flag
   IOMAP_COW_READ_DONE (a new flag).
4. btrfs_iomap_begin() fills up the iomap structure with write information.

Step 3 seems out of place because iomap_apply should be iomap.type
agnostic, right? Should we be adding another flag IOMAP_COW_DONE, just to
figure out that this is the "real" write for iomap_begin to fill iomap?

If this is not how you imagined it, could you elaborate on the dual
iteration sequence?
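To make the four steps above easier to follow, here is a rough sketch of the control flow being proposed. This is not code from the series: do_cow() and IOMAP_COW_READ_DONE are the hypothetical helper and flag named in the steps, the pre-series single-iomap signatures are kept (a dual iteration would not need a srcmap argument), and ->iomap_end handling plus most error paths are omitted.

static loff_t
iomap_apply_cow_sketch(struct inode *inode, loff_t pos, loff_t length,
		       unsigned flags, const struct iomap_ops *ops,
		       void *data, iomap_actor_t actor)
{
	struct iomap iomap = { 0 };
	loff_t ret;

	/* Steps 1-2: the fs may answer with IOMAP_COW plus read information. */
	ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
	if (ret)
		return ret;

	if (iomap.type == IOMAP_COW) {
		/* Step 3: copy the old data described by the read mapping. */
		ret = do_cow(inode, pos, length, &iomap);
		if (ret)
			return ret;

		/* Steps 3-4: ask the fs again, this time for the write mapping. */
		ret = ops->iomap_begin(inode, pos, length,
				       flags | IOMAP_COW_READ_DONE, &iomap);
		if (ret)
			return ret;
	}

	/* Hand the (write) mapping to the actor as usual. */
	return actor(inode, pos, length, data, &iomap);
}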
On 17:46 21/06, Darrick J. Wong wrote: > On Fri, Jun 21, 2019 at 02:28:23PM -0500, Goldwyn Rodrigues wrote: > > From: Goldwyn Rodrigues <rgoldwyn@suse.com> > > > > Introduces a new type IOMAP_COW, which means the data at offset > > must be read from a srcmap and copied before performing the > > write on the offset. > > > > The srcmap is used to identify where the read is to be performed > > from. This is passed to iomap->begin(), which is supposed to > > put in the details for reading, typically set with type IOMAP_READ. > > What is IOMAP_READ ? Badd comment. I should have written IOMAP_MAPPED or IOMAP_HOLE. > > > > > Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com> > > --- > > fs/dax.c | 8 +++++--- > > fs/ext2/inode.c | 2 +- > > fs/ext4/inode.c | 2 +- > > fs/gfs2/bmap.c | 3 ++- > > fs/internal.h | 2 +- > > fs/iomap.c | 31 ++++++++++++++++--------------- > > fs/xfs/xfs_iomap.c | 9 ++++++--- > > include/linux/iomap.h | 4 +++- > > 8 files changed, 35 insertions(+), 26 deletions(-) > > > > diff --git a/fs/dax.c b/fs/dax.c > > index 2e48c7ebb973..80b9e2599223 100644 > > --- a/fs/dax.c > > +++ b/fs/dax.c > > @@ -1078,7 +1078,7 @@ EXPORT_SYMBOL_GPL(__dax_zero_page_range); > > > > static loff_t > > dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, > > - struct iomap *iomap) > > + struct iomap *iomap, struct iomap *srcmap) > > { > > struct block_device *bdev = iomap->bdev; > > struct dax_device *dax_dev = iomap->dax_dev; > > @@ -1236,6 +1236,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, > > unsigned long vaddr = vmf->address; > > loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; > > struct iomap iomap = { 0 }; > > + struct iomap srcmap = { 0 }; > > unsigned flags = IOMAP_FAULT; > > int error, major = 0; > > bool write = vmf->flags & FAULT_FLAG_WRITE; > > @@ -1280,7 +1281,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, > > * the file system block size to be equal the page size, which means > > * that we never have to deal with more than a single extent here. > > */ > > - error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); > > + error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap); > > if (iomap_errp) > > *iomap_errp = error; > > if (error) { > > @@ -1460,6 +1461,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, > > struct inode *inode = mapping->host; > > vm_fault_t result = VM_FAULT_FALLBACK; > > struct iomap iomap = { 0 }; > > + struct iomap srcmap = { 0 }; > > pgoff_t max_pgoff; > > void *entry; > > loff_t pos; > > @@ -1534,7 +1536,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, > > * to look up our filesystem block. > > */ > > pos = (loff_t)xas.xa_index << PAGE_SHIFT; > > - error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); > > + error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap, &srcmap); > > Line too long? > > Also, I guess the DAX and directio write paths will just WARN_ON_ONCE if > someone feeds them an IOMAP_COW type iomap? > > Ah, right, I guess the only filesystems that use iomap directio and > iomap dax don't support COW. :) Well, I am planning on adding direct I/O to btrfs using iomap, but am stuck with some bugs. So, I will add the WARN_ON(). 
> > --D > > > if (error) > > goto unlock_entry; > > > > diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c > > index e474127dd255..f081f11980ad 100644 > > --- a/fs/ext2/inode.c > > +++ b/fs/ext2/inode.c > > @@ -801,7 +801,7 @@ int ext2_get_block(struct inode *inode, sector_t iblock, > > > > #ifdef CONFIG_FS_DAX > > static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, > > - unsigned flags, struct iomap *iomap) > > + unsigned flags, struct iomap *iomap, struct iomap *srcmap) > > { > > unsigned int blkbits = inode->i_blkbits; > > unsigned long first_block = offset >> blkbits; > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > > index c7f77c643008..a8017e0c302b 100644 > > --- a/fs/ext4/inode.c > > +++ b/fs/ext4/inode.c > > @@ -3437,7 +3437,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) > > } > > > > static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, > > - unsigned flags, struct iomap *iomap) > > + unsigned flags, struct iomap *iomap, struct iomap *srcmap) > > { > > struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); > > unsigned int blkbits = inode->i_blkbits; > > diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c > > index 93ea1d529aa3..affa0c4305b7 100644 > > --- a/fs/gfs2/bmap.c > > +++ b/fs/gfs2/bmap.c > > @@ -1124,7 +1124,8 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, > > } > > > > static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, > > - unsigned flags, struct iomap *iomap) > > + unsigned flags, struct iomap *iomap, > > + struct iomap *srcmap) > > { > > struct gfs2_inode *ip = GFS2_I(inode); > > struct metapath mp = { .mp_aheight = 1, }; > > diff --git a/fs/internal.h b/fs/internal.h > > index a48ef81be37d..79e495d86165 100644 > > --- a/fs/internal.h > > +++ b/fs/internal.h > > @@ -188,7 +188,7 @@ extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd, > > * iomap support: > > */ > > typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, > > - void *data, struct iomap *iomap); > > + void *data, struct iomap *iomap, struct iomap *srcmap); > > > > loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, > > unsigned flags, const struct iomap_ops *ops, void *data, > > diff --git a/fs/iomap.c b/fs/iomap.c > > index 23ef63fd1669..6648957af268 100644 > > --- a/fs/iomap.c > > +++ b/fs/iomap.c > > @@ -41,6 +41,7 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, > > const struct iomap_ops *ops, void *data, iomap_actor_t actor) > > { > > struct iomap iomap = { 0 }; > > + struct iomap srcmap = { 0 }; > > loff_t written = 0, ret; > > > > /* > > @@ -55,7 +56,7 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, > > * expose transient stale data. If the reserve fails, we can safely > > * back out at this point as there is nothing to undo. > > */ > > - ret = ops->iomap_begin(inode, pos, length, flags, &iomap); > > + ret = ops->iomap_begin(inode, pos, length, flags, &iomap, &srcmap); > > if (ret) > > return ret; > > if (WARN_ON(iomap.offset > pos)) > > @@ -75,7 +76,7 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, > > * we can do the copy-in page by page without having to worry about > > * failures exposing transient data. > > */ > > - written = actor(inode, pos, length, data, &iomap); > > + written = actor(inode, pos, length, data, &iomap, &srcmap); > > > > /* > > * Now the data has been copied, commit the range we've copied. 
This > > @@ -282,7 +283,7 @@ iomap_read_inline_data(struct inode *inode, struct page *page, > > > > static loff_t > > iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, > > - struct iomap *iomap) > > + struct iomap *iomap, struct iomap *srcmap) > > { > > struct iomap_readpage_ctx *ctx = data; > > struct page *page = ctx->cur_page; > > @@ -424,7 +425,7 @@ iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos, > > > > static loff_t > > iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, > > - void *data, struct iomap *iomap) > > + void *data, struct iomap *iomap, struct iomap *srcmap) > > { > > struct iomap_readpage_ctx *ctx = data; > > loff_t done, ret; > > @@ -444,7 +445,7 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, > > ctx->cur_page_in_bio = false; > > } > > ret = iomap_readpage_actor(inode, pos + done, length - done, > > - ctx, iomap); > > + ctx, iomap, srcmap); > > } > > > > return done; > > @@ -796,7 +797,7 @@ iomap_write_end(struct inode *inode, loff_t pos, unsigned len, > > > > static loff_t > > iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, > > - struct iomap *iomap) > > + struct iomap *iomap, struct iomap *srcmap) > > { > > struct iov_iter *i = data; > > long status = 0; > > @@ -913,7 +914,7 @@ __iomap_read_page(struct inode *inode, loff_t offset) > > > > static loff_t > > iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, > > - struct iomap *iomap) > > + struct iomap *iomap, struct iomap *srcmap) > > { > > long status = 0; > > ssize_t written = 0; > > @@ -1002,7 +1003,7 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, > > > > static loff_t > > iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, > > - void *data, struct iomap *iomap) > > + void *data, struct iomap *iomap, struct iomap *srcmap) > > { > > bool *did_zero = data; > > loff_t written = 0; > > @@ -1071,7 +1072,7 @@ EXPORT_SYMBOL_GPL(iomap_truncate_page); > > > > static loff_t > > iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, > > - void *data, struct iomap *iomap) > > + void *data, struct iomap *iomap, struct iomap *srcmap) > > { > > struct page *page = data; > > int ret; > > @@ -1169,7 +1170,7 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, > > > > static loff_t > > iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, > > - struct iomap *iomap) > > + struct iomap *iomap, struct iomap *srcmap) > > { > > struct fiemap_ctx *ctx = data; > > loff_t ret = length; > > @@ -1343,7 +1344,7 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, > > > > static loff_t > > iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length, > > - void *data, struct iomap *iomap) > > + void *data, struct iomap *iomap, struct iomap *srcmap) > > { > > switch (iomap->type) { > > case IOMAP_UNWRITTEN: > > @@ -1389,7 +1390,7 @@ EXPORT_SYMBOL_GPL(iomap_seek_hole); > > > > static loff_t > > iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length, > > - void *data, struct iomap *iomap) > > + void *data, struct iomap *iomap, struct iomap *srcmap) > > { > > switch (iomap->type) { > > case IOMAP_HOLE: > > @@ -1790,7 +1791,7 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, > > > > static loff_t > > iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, > > - void *data, struct iomap *iomap) > > + void *data, 
struct iomap *iomap, struct iomap *srcmap) > > { > > struct iomap_dio *dio = data; > > > > @@ -2057,7 +2058,7 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) > > * distinction between written and unwritten extents. > > */ > > static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos, > > - loff_t count, void *data, struct iomap *iomap) > > + loff_t count, void *data, struct iomap *iomap, struct iomap *srcmap) > > { > > struct iomap_swapfile_info *isi = data; > > int error; > > @@ -2161,7 +2162,7 @@ EXPORT_SYMBOL_GPL(iomap_swapfile_activate); > > > > static loff_t > > iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length, > > - void *data, struct iomap *iomap) > > + void *data, struct iomap *iomap, struct iomap *srcmap) > > { > > sector_t *bno = data, addr; > > > > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c > > index 63d323916bba..6116a75f03da 100644 > > --- a/fs/xfs/xfs_iomap.c > > +++ b/fs/xfs/xfs_iomap.c > > @@ -925,7 +925,8 @@ xfs_file_iomap_begin( > > loff_t offset, > > loff_t length, > > unsigned flags, > > - struct iomap *iomap) > > + struct iomap *iomap, > > + struct iomap *srcmap) > > { > > struct xfs_inode *ip = XFS_I(inode); > > struct xfs_mount *mp = ip->i_mount; > > @@ -1148,7 +1149,8 @@ xfs_seek_iomap_begin( > > loff_t offset, > > loff_t length, > > unsigned flags, > > - struct iomap *iomap) > > + struct iomap *iomap, > > + struct iomap *srcmap) > > { > > struct xfs_inode *ip = XFS_I(inode); > > struct xfs_mount *mp = ip->i_mount; > > @@ -1234,7 +1236,8 @@ xfs_xattr_iomap_begin( > > loff_t offset, > > loff_t length, > > unsigned flags, > > - struct iomap *iomap) > > + struct iomap *iomap, > > + struct iomap *srcmap) > > { > > struct xfs_inode *ip = XFS_I(inode); > > struct xfs_mount *mp = ip->i_mount; > > diff --git a/include/linux/iomap.h b/include/linux/iomap.h > > index 2103b94cb1bf..f49767c7fd83 100644 > > --- a/include/linux/iomap.h > > +++ b/include/linux/iomap.h > > @@ -25,6 +25,7 @@ struct vm_fault; > > #define IOMAP_MAPPED 0x03 /* blocks allocated at @addr */ > > #define IOMAP_UNWRITTEN 0x04 /* blocks allocated at @addr in unwritten state */ > > #define IOMAP_INLINE 0x05 /* data inline in the inode */ > > +#define IOMAP_COW 0x06 /* copy data from srcmap before writing */ > > > > /* > > * Flags for all iomap mappings: > > @@ -102,7 +103,8 @@ struct iomap_ops { > > * The actual length is returned in iomap->length. > > */ > > int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length, > > - unsigned flags, struct iomap *iomap); > > + unsigned flags, struct iomap *iomap, > > + struct iomap *srcmap); > > > > /* > > * Commit and/or unreserve space previous allocated using iomap_begin. > > -- > > 2.16.4 > >
On 6/26/19 3:14 AM, Goldwyn Rodrigues wrote:
> On 9:07 24/06, Christoph Hellwig wrote:
>> xfs will need to be updated to fill in the additional iomap for the
>> COW case. Has this series been tested on xfs?
>
> No, I have not tested this, nor made xfs set IOMAP_COW. I will try to
> do it in the next iteration.

Hi Goldwyn,

I'm willing to help you test this on XFS. If you need any help, please feel free to tell me. :)
On Fri, Jun 21, 2019 at 05:46:24PM -0700, Darrick J. Wong wrote: > On Fri, Jun 21, 2019 at 02:28:23PM -0500, Goldwyn Rodrigues wrote: > > From: Goldwyn Rodrigues <rgoldwyn@suse.com> > > > > Introduces a new type IOMAP_COW, which means the data at offset > > must be read from a srcmap and copied before performing the > > write on the offset. > > > > The srcmap is used to identify where the read is to be performed > > from. This is passed to iomap->begin(), which is supposed to > > put in the details for reading, typically set with type IOMAP_READ. > > What is IOMAP_READ ? The lack of flags. Which reminds me that our IOMAP_* types have pretty much gotten out of hand in how we use some flags that really are different types vs others that are modifiers. We'll need to clean this up a bit eventually. > > > > > Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com> > > --- > > fs/dax.c | 8 +++++--- > > fs/ext2/inode.c | 2 +- > > fs/ext4/inode.c | 2 +- > > fs/gfs2/bmap.c | 3 ++- > > fs/internal.h | 2 +- > > fs/iomap.c | 31 ++++++++++++++++--------------- > > fs/xfs/xfs_iomap.c | 9 ++++++--- > > include/linux/iomap.h | 4 +++- > > 8 files changed, 35 insertions(+), 26 deletions(-) > > > > diff --git a/fs/dax.c b/fs/dax.c > > index 2e48c7ebb973..80b9e2599223 100644 > > --- a/fs/dax.c > > +++ b/fs/dax.c > > @@ -1078,7 +1078,7 @@ EXPORT_SYMBOL_GPL(__dax_zero_page_range); > > > > static loff_t > > dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, > > - struct iomap *iomap) > > + struct iomap *iomap, struct iomap *srcmap) > > { > > struct block_device *bdev = iomap->bdev; > > struct dax_device *dax_dev = iomap->dax_dev; > > @@ -1236,6 +1236,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, > > unsigned long vaddr = vmf->address; > > loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; > > struct iomap iomap = { 0 }; > > + struct iomap srcmap = { 0 }; > > unsigned flags = IOMAP_FAULT; > > int error, major = 0; > > bool write = vmf->flags & FAULT_FLAG_WRITE; > > @@ -1280,7 +1281,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, > > * the file system block size to be equal the page size, which means > > * that we never have to deal with more than a single extent here. > > */ > > - error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); > > + error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap); > > if (iomap_errp) > > *iomap_errp = error; > > if (error) { > > @@ -1460,6 +1461,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, > > struct inode *inode = mapping->host; > > vm_fault_t result = VM_FAULT_FALLBACK; > > struct iomap iomap = { 0 }; > > + struct iomap srcmap = { 0 }; > > pgoff_t max_pgoff; > > void *entry; > > loff_t pos; > > @@ -1534,7 +1536,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, > > * to look up our filesystem block. > > */ > > pos = (loff_t)xas.xa_index << PAGE_SHIFT; > > - error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); > > + error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap, &srcmap); > > Line too long? > > Also, I guess the DAX and directio write paths will just WARN_ON_ONCE if > someone feeds them an IOMAP_COW type iomap? > > Ah, right, I guess the only filesystems that use iomap directio and > iomap dax don't support COW. :) ? XFS does iomap based cow for direct I/O. But we don't use IOMAP_COW yey with this series as far as I can tell.
On Tue, Jun 25, 2019 at 02:14:42PM -0500, Goldwyn Rodrigues wrote:
> > I can't say I'm a huge fan of this two iomaps in one method call
> > approach. I always thought two separate iomap iterations would be nicer,
> > but compared to that even the older hack with just the additional
> > src_addr seems a little better.
>
> I am just expanding on your idea of using multiple iterations for the COW
> case in the hope we can come out with a good design:
>
> 1. iomap_file_buffered_write calls iomap_apply with the IOMAP_WRITE flag,
>    which calls iomap_begin() for the respective filesystem.
> 2. btrfs_iomap_begin() sets up iomap->type as IOMAP_COW and fills the
>    iomap struct with read address information.
> 3. iomap_apply() conditionally, for IOMAP_COW, calls do_cow() (a new
>    function) and then calls ops->iomap_begin() with the flag
>    IOMAP_COW_READ_DONE (a new flag).
> 4. btrfs_iomap_begin() fills up the iomap structure with write information.
>
> Step 3 seems out of place because iomap_apply should be iomap.type
> agnostic, right? Should we be adding another flag IOMAP_COW_DONE, just to
> figure out that this is the "real" write for iomap_begin to fill iomap?
>
> If this is not how you imagined it, could you elaborate on the dual
> iteration sequence?

Here are my thoughts from dealing with this a while ago, all XFS based
of course.

If iomap_file_buffered_write is called on a page that is inside a COW
extent we have the following options:

 a) the page is uptodate or entirely overwritten. We can just allocate
    new COW blocks and return them, and we are done.
 b) the page is not/partially uptodate and not entirely overwritten.

The latter case is the interesting one. My thought was that iff the
IOMAP_F_SHARED flag is set, __iomap_write_begin / iomap_read_page_sync
will then have to retrieve the source information in some form.

My original plan was to just do a nested iomap_apply call, which would
need a special nested flag to not duplicate any locking the file system
might be holding between ->iomap_begin and ->iomap_end.

The upside here is that there is no additional overhead for the non-COW
path and the architecture looks relatively clean. The downside is that
at least for XFS we usually have to look up the source information
anyway before allocating the COW destination extent, so we'd have to
cache that information somewhere or redo it, which would be rather
pointless. At that point the idea of a srcaddr in the iomap becomes
interesting again - while it looks a little ugly from the architectural
POV it actually ends up having very practical benefits.

> --
> Goldwyn

---end quoted text---
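As a small illustration of that last point, the "srcaddr in the iomap" variant boils down to one extra decision in the write_begin read-in path. A minimal sketch, assuming a hypothetical src_addr field in struct iomap (nothing below is from the series; addr, offset and flags are the existing fields):

static u64
iomap_rmw_read_addr(struct iomap *iomap, loff_t pos)
{
	/*
	 * Only case (b) above gets here: a partial write of a non-uptodate
	 * page has to read old data first, and for a shared/COW extent that
	 * data lives at the source address rather than at the newly
	 * allocated destination blocks.
	 */
	if (iomap->flags & IOMAP_F_SHARED)
		return iomap->src_addr + (pos - iomap->offset);

	return iomap->addr + (pos - iomap->offset);
}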
On 8:39 26/06, Christoph Hellwig wrote: > On Tue, Jun 25, 2019 at 02:14:42PM -0500, Goldwyn Rodrigues wrote: > > > I can't say I'm a huge fan of this two iomaps in one method call > > > approach. I always though two separate iomap iterations would be nicer, > > > but compared to that even the older hack with just the additional > > > src_addr seems a little better. > > > > I am just expanding on your idea of using multiple iterations for the Cow case > > in the hope we can come out of a good design: > > > > 1. iomap_file_buffered_write calls iomap_apply with IOMAP_WRITE flag. > > which calls iomap_begin() for the respective filesystem. > > 2. btrfs_iomap_begin() sets up iomap->type as IOMAP_COW and fills iomap > > struct with read addr information. > > 3. iomap_apply() conditionally for IOMAP_COW calls do_cow(new function) > > and calls ops->iomap_begin() with flag IOMAP_COW_READ_DONE(new flag). > > 4. btrfs_iomap_begin() fills up iomap structure with write information. > > > > Step 3 seems out of place because iomap_apply should be iomap.type agnostic. > > Right? > > Should we be adding another flag IOMAP_COW_DONE, just to figure out that > > this is the "real" write for iomap_begin to fill iomap? > > > > If this is not how you imagined, could you elaborate on the dual iteration > > sequence? > > Here are my thoughts from dealing with this from a while ago, all > XFS based of course. > > If iomap_file_buffered_write is called on a page that is inside a COW > extent we have the following options: > > a) the page is updatodate or entirely overwritten. We cn just allocate > new COW blocks and return them, and we are done > b) the page is not/partially uptodate and not entirely overwritten. > > The latter case is the interesting one. My thought was that iff the > IOMAP_F_SHARED flag is set __iomap_write_begin / iomap_read_page_sync > will then have to retreive the source information in some form. > > My original plan was to just do a nested iomap_apply call, which would > need a special nested flag to not duplicate any locking the file > system might be holding between ->iomap_begin and ->iomap_end. > > The upside here is that there is no additional overhead for the non-COW > path and the architecture looks relatively clean. The downside is that > at least for XFS we usually have to look up the source information > anyway before allocating the COW destination extent, so we'd have to > cache that information somewhere or redo it, which would be rather > pointless. At that point the idea of a srcaddr in the iomap becomes > interesting again - while it looks a little ugly from the architectural > POV it actually ends up having very practical benefits. So, do we move back to the design of adding an extra field of srcaddr? Honestly, I find the design of using an extra field srcaddr in iomap better and simpler versus passing additional iomap srcmap or multiple iterations. Also, should we add another iomap type IOMAP_COW, or (re)use the flag IOMAP_F_SHARED during writes? IOW iomap type vs iomap flag. Dave/Darrick, what are your thoughts?
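For reference, the two interface shapes being weighed here look roughly as follows. The struct layout is abbreviated and src_addr is the hypothetical extra field; the second prototype is the one this patch actually introduces.

/* Option 1: bolt a source address onto the existing mapping ("srcaddr"). */
struct iomap {
	u64		addr;		/* where the write goes */
	u64		src_addr;	/* where old data is read from (COW) */
	loff_t		offset;
	u64		length;
	u16		type;
	u16		flags;
	/* bdev, dax_dev, inline_data, private, ... */
};

/* Option 2: pass a second, full mapping to ->iomap_begin (this series). */
int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length,
		   unsigned flags, struct iomap *iomap,
		   struct iomap *srcmap);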
On Wed, Jun 26, 2019 at 11:10:17AM -0500, Goldwyn Rodrigues wrote: > On 8:39 26/06, Christoph Hellwig wrote: > > On Tue, Jun 25, 2019 at 02:14:42PM -0500, Goldwyn Rodrigues wrote: > > > > I can't say I'm a huge fan of this two iomaps in one method call > > > > approach. I always though two separate iomap iterations would be nicer, > > > > but compared to that even the older hack with just the additional > > > > src_addr seems a little better. > > > > > > I am just expanding on your idea of using multiple iterations for the Cow case > > > in the hope we can come out of a good design: > > > > > > 1. iomap_file_buffered_write calls iomap_apply with IOMAP_WRITE flag. > > > which calls iomap_begin() for the respective filesystem. > > > 2. btrfs_iomap_begin() sets up iomap->type as IOMAP_COW and fills iomap > > > struct with read addr information. > > > 3. iomap_apply() conditionally for IOMAP_COW calls do_cow(new function) > > > and calls ops->iomap_begin() with flag IOMAP_COW_READ_DONE(new flag). > > > 4. btrfs_iomap_begin() fills up iomap structure with write information. > > > > > > Step 3 seems out of place because iomap_apply should be iomap.type agnostic. > > > Right? > > > Should we be adding another flag IOMAP_COW_DONE, just to figure out that > > > this is the "real" write for iomap_begin to fill iomap? > > > > > > If this is not how you imagined, could you elaborate on the dual iteration > > > sequence? > > > > Here are my thoughts from dealing with this from a while ago, all > > XFS based of course. > > > > If iomap_file_buffered_write is called on a page that is inside a COW > > extent we have the following options: > > > > a) the page is updatodate or entirely overwritten. We cn just allocate > > new COW blocks and return them, and we are done > > b) the page is not/partially uptodate and not entirely overwritten. > > > > The latter case is the interesting one. My thought was that iff the > > IOMAP_F_SHARED flag is set __iomap_write_begin / iomap_read_page_sync > > will then have to retreive the source information in some form. > > > > My original plan was to just do a nested iomap_apply call, which would > > need a special nested flag to not duplicate any locking the file > > system might be holding between ->iomap_begin and ->iomap_end. > > > > The upside here is that there is no additional overhead for the non-COW > > path and the architecture looks relatively clean. The downside is that > > at least for XFS we usually have to look up the source information > > anyway before allocating the COW destination extent, so we'd have to > > cache that information somewhere or redo it, which would be rather > > pointless. At that point the idea of a srcaddr in the iomap becomes > > interesting again - while it looks a little ugly from the architectural > > POV it actually ends up having very practical benefits. I think it's less complicated to pass both mappings out in a single ->iomap_begin call rather than have this dance where the fs tells iomap to call back for the read mapping and then iomap calls back for the read mapping with a special "don't take locks" flag. For XFS specifically this means we can serve both mappings with a single ILOCK cycle. > So, do we move back to the design of adding an extra field of srcaddr? TLDR: Please no. > Honestly, I find the design of using an extra field srcaddr in iomap better > and simpler versus passing additional iomap srcmap or multiple iterations. 
Putting my long-range planning hat on, the usage story (from the fs'
perspective) here is:

"iomap wants to know how a file write should map to a disk write. If
we're doing a straight overwrite of disk blocks then I should send back
the relevant mapping. Sometimes I might need the write to go to a
totally different location than where the data is currently stored, so
I need to send back a second mapping."

Because iomap is now a general-purpose API, we need to think about the
read mapping for a moment:

- For all disk-based filesystems we'll want the source address for the
  read mapping.

- For filesystems that support "inline data" (which really just means
  the fs maintains its own buffers to file data) we'll also need the
  inline_data pointer.

- For filesystems that support multiple devices (like btrfs) we'll also
  need a pointer to a block_device because we could be writing to a
  different device than the one that stores the data. The prime example
  I can think of is reading data from disk A in some RAID stripe and
  writing to disk B in a different RAID stripe to solve the RAID5
  hole... but you could just be lazy-migrating file data to less full
  or newer drives or whatever.

- If we ever encounter a filesystem that supports multiple dax devices
  then we'll need a pointer to the dax_device too. (That would be
  btrfs, since I thought your larger goal was to enable dax there...)

- We're probably going to need the ability to pass flags for the read
  mapping at some point or another, so we need that too.

From this, you can see that we need half the fields in the existing
struct iomap, and that's how I arrived at the idea of passing to
iomap_begin pointers to two iomaps instead of bolting these five fields
into struct iomap.

In XFS parlance (where the data fork stores mappings for on-disk data
and the cow fork stores mappings for copy-on-write staging blocks),
this means that our iomap_begin data paths remain fairly
straightforward:

	xfs_bmapi_read(ip, offset, XFS_DATA_FORK, &imap...);
	xfs_bmapi_read(ip, offset, XFS_COW_FORK, &cmap...);
	xfs_bmbt_to_iomap(ip, iomap, &imap...);
	iomap->type = IOMAP_COW;
	xfs_bmbt_to_iomap(ip, srcmap, &cmap...);

(It's more complicated than that, but that's approximately what we do
now.)

> Also, should we add another iomap type IOMAP_COW, or (re)use the flag
> IOMAP_F_SHARED during writes? IOW iomap type vs iomap flag.

I think they need to remain distinct types because the IOMAP_F_SHARED
flag means that the storage is shared amongst multiple owners (which
iomap_fiemap reports to userspace), whereas the IOMAP_COW type means
that RMW is required for unaligned writes. btrfs has a strong tendency
to require copy writes, but that doesn't mean the extent is necessarily
shared.

> Dave/Darrick, what are your thoughts?

I liked where the first 3 patches of this series were heading. :)

--D

> --
> Goldwyn
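Connecting this to the new ->iomap_begin signature, a COW-aware begin routine could be sketched as below. This is an illustration only: every fs_* helper is a made-up stand-in for the filesystem's own mapping code (the quoted XFS pseudocode plays that role above), and the maps are arranged so that, per the patch description, srcmap carries the read source while iomap carries the write destination.

static int
fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
	       unsigned flags, struct iomap *iomap, struct iomap *srcmap)
{
	struct fs_extent old, new;	/* hypothetical fs mapping type */
	int error;

	/* Where the data currently lives: the read side of any RMW. */
	error = fs_lookup_extent(inode, offset, length, &old);
	if (error)
		return error;

	if (!(flags & IOMAP_WRITE) || !fs_write_needs_cow(inode, &old)) {
		/* Plain read or in-place overwrite: one mapping suffices. */
		fs_extent_to_iomap(inode, iomap, &old);
		return 0;
	}

	/* COW write: allocate (or look up) the destination blocks. */
	error = fs_alloc_cow_extent(inode, offset, length, &new);
	if (error)
		return error;

	fs_extent_to_iomap(inode, iomap, &new);		/* write destination */
	iomap->type = IOMAP_COW;
	fs_extent_to_iomap(inode, srcmap, &old);	/* read source */
	return 0;
}

Note that the non-COW path still hands back a single mapping, so filesystems that never copy-on-write are unaffected by the extra argument.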
On Tue, Jun 25, 2019 at 02:14:42PM -0500, Goldwyn Rodrigues wrote: > On 9:07 24/06, Christoph Hellwig wrote: > > xfs will need to be updated to fill in the additional iomap for the > > COW case. Has this series been tested on xfs? > > > > No, I have not tested this, or make xfs set IOMAP_COW. I will try to do > it in the next iteration. AFAICT even if you did absolutely nothing XFS would continue to work properly because iomap_write_begin doesn't actually care if it's going to be a COW write because the only IO it does from the mapping is to read in the non-uptodate parts of the page if the write offset/len aren't page-aligned. > > I can't say I'm a huge fan of this two iomaps in one method call > > approach. I always though two separate iomap iterations would be nicer, > > but compared to that even the older hack with just the additional > > src_addr seems a little better. > > I am just expanding on your idea of using multiple iterations for the Cow case > in the hope we can come out of a good design: > > 1. iomap_file_buffered_write calls iomap_apply with IOMAP_WRITE flag. > which calls iomap_begin() for the respective filesystem. > 2. btrfs_iomap_begin() sets up iomap->type as IOMAP_COW and fills iomap > struct with read addr information. > 3. iomap_apply() conditionally for IOMAP_COW calls do_cow(new function) > and calls ops->iomap_begin() with flag IOMAP_COW_READ_DONE(new flag). Unless I'm misreading this, you don't need a do_cow() or IOMAP_COW_READ_DONE because the page state tracks that for you: iomap_write_begin calls ->iomap_begin to learn from where it should read data if the write is not aligned to a page and the page isn't uptodate. If it's IOMAP_COW then we learn from *srcmap instead of *iomap. (The write actor then dirties the page) fsync() or whatever The mm calls ->writepage. The filesystem grabs the new COW mapping, constructs a bio with the new mapping and dirty pages, and submits the bio. pagesize >= blocksize so we're always writing full blocks. The writeback bio completes and calls ->bio_endio, which is the filesystem's trigger to make the mapping changes permanent, update ondisk file size, etc. For direct writes that are not block-aligned, we just bounce the write to the page cache... ...so it's only dax_iomap_rw where we're going to have to do the COW ourselves. That's simple -- map both addresses, copy the regions before offset and after offset+len, then proceed with writing whatever userspace sent us. No need for the iomap code itself to get involved. > 4. btrfs_iomap_begin() fills up iomap structure with write information. > > Step 3 seems out of place because iomap_apply should be iomap.type agnostic. > Right? > Should we be adding another flag IOMAP_COW_DONE, just to figure out that > this is the "real" write for iomap_begin to fill iomap? > > If this is not how you imagined, could you elaborate on the dual iteration > sequence? --D > > > -- > Goldwyn
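A minimal sketch of that dax_iomap_rw() copy-around, assuming blocks equal to PAGE_SIZE: dax_cow_copy() is a made-up helper standing in for dax_direct_access() plus a memcpy of the mapped addresses, and alignment/error handling is simplified.

static int
dax_cow_edges(struct iomap *iomap, struct iomap *srcmap, loff_t pos,
	      size_t len)
{
	loff_t start = round_down(pos, PAGE_SIZE);
	loff_t end = round_up(pos + len, PAGE_SIZE);
	int ret;

	/* Old bytes in front of the write: copy source -> destination. */
	ret = dax_cow_copy(srcmap, iomap, start, pos - start);
	if (ret)
		return ret;

	/* Old bytes behind the write: copy source -> destination. */
	ret = dax_cow_copy(srcmap, iomap, pos + len, end - (pos + len));
	if (ret)
		return ret;

	/* The caller then writes the user's data into [pos, pos + len). */
	return 0;
}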
On 11:00 26/06, Darrick J. Wong wrote: > On Tue, Jun 25, 2019 at 02:14:42PM -0500, Goldwyn Rodrigues wrote: > > On 9:07 24/06, Christoph Hellwig wrote: > > > xfs will need to be updated to fill in the additional iomap for the > > > COW case. Has this series been tested on xfs? > > > > > > > No, I have not tested this, or make xfs set IOMAP_COW. I will try to do > > it in the next iteration. > > AFAICT even if you did absolutely nothing XFS would continue to work > properly because iomap_write_begin doesn't actually care if it's going > to be a COW write because the only IO it does from the mapping is to > read in the non-uptodate parts of the page if the write offset/len > aren't page-aligned. > > > > I can't say I'm a huge fan of this two iomaps in one method call > > > approach. I always though two separate iomap iterations would be nicer, > > > but compared to that even the older hack with just the additional > > > src_addr seems a little better. > > > > I am just expanding on your idea of using multiple iterations for the Cow case > > in the hope we can come out of a good design: > > > > 1. iomap_file_buffered_write calls iomap_apply with IOMAP_WRITE flag. > > which calls iomap_begin() for the respective filesystem. > > 2. btrfs_iomap_begin() sets up iomap->type as IOMAP_COW and fills iomap > > struct with read addr information. > > 3. iomap_apply() conditionally for IOMAP_COW calls do_cow(new function) > > and calls ops->iomap_begin() with flag IOMAP_COW_READ_DONE(new flag). > > Unless I'm misreading this, you don't need a do_cow() or > IOMAP_COW_READ_DONE because the page state tracks that for you: > > iomap_write_begin calls ->iomap_begin to learn from where it should read > data if the write is not aligned to a page and the page isn't uptodate. > If it's IOMAP_COW then we learn from *srcmap instead of *iomap. > We were discussing the single iomap structure (without srcmap), but with two iterations, the first to get the read information and the second to get the write information for CoW. > (The write actor then dirties the page) > > fsync() or whatever > > The mm calls ->writepage. The filesystem grabs the new COW mapping, > constructs a bio with the new mapping and dirty pages, and submits the > bio. pagesize >= blocksize so we're always writing full blocks. > > The writeback bio completes and calls ->bio_endio, which is the > filesystem's trigger to make the mapping changes permanent, update > ondisk file size, etc. > > For direct writes that are not block-aligned, we just bounce the write > to the page cache... > > ...so it's only dax_iomap_rw where we're going to have to do the COW > ourselves. That's simple -- map both addresses, copy the regions before > offset and after offset+len, then proceed with writing whatever > userspace sent us. No need for the iomap code itself to get involved. Yes, this is how the latest patch series is working, with two iomap structures in iomap_begin() function parameters.
diff --git a/fs/dax.c b/fs/dax.c index 2e48c7ebb973..80b9e2599223 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1078,7 +1078,7 @@ EXPORT_SYMBOL_GPL(__dax_zero_page_range); static loff_t dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap) + struct iomap *iomap, struct iomap *srcmap) { struct block_device *bdev = iomap->bdev; struct dax_device *dax_dev = iomap->dax_dev; @@ -1236,6 +1236,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; struct iomap iomap = { 0 }; + struct iomap srcmap = { 0 }; unsigned flags = IOMAP_FAULT; int error, major = 0; bool write = vmf->flags & FAULT_FLAG_WRITE; @@ -1280,7 +1281,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, * the file system block size to be equal the page size, which means * that we never have to deal with more than a single extent here. */ - error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); + error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap); if (iomap_errp) *iomap_errp = error; if (error) { @@ -1460,6 +1461,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, struct inode *inode = mapping->host; vm_fault_t result = VM_FAULT_FALLBACK; struct iomap iomap = { 0 }; + struct iomap srcmap = { 0 }; pgoff_t max_pgoff; void *entry; loff_t pos; @@ -1534,7 +1536,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * to look up our filesystem block. */ pos = (loff_t)xas.xa_index << PAGE_SHIFT; - error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); + error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap, &srcmap); if (error) goto unlock_entry; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index e474127dd255..f081f11980ad 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -801,7 +801,7 @@ int ext2_get_block(struct inode *inode, sector_t iblock, #ifdef CONFIG_FS_DAX static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, - unsigned flags, struct iomap *iomap) + unsigned flags, struct iomap *iomap, struct iomap *srcmap) { unsigned int blkbits = inode->i_blkbits; unsigned long first_block = offset >> blkbits; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c7f77c643008..a8017e0c302b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3437,7 +3437,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) } static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, - unsigned flags, struct iomap *iomap) + unsigned flags, struct iomap *iomap, struct iomap *srcmap) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned int blkbits = inode->i_blkbits; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 93ea1d529aa3..affa0c4305b7 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1124,7 +1124,8 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, } static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, - unsigned flags, struct iomap *iomap) + unsigned flags, struct iomap *iomap, + struct iomap *srcmap) { struct gfs2_inode *ip = GFS2_I(inode); struct metapath mp = { .mp_aheight = 1, }; diff --git a/fs/internal.h b/fs/internal.h index a48ef81be37d..79e495d86165 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -188,7 +188,7 @@ extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd, * iomap support: */ typedef loff_t (*iomap_actor_t)(struct inode 
*inode, loff_t pos, loff_t len, - void *data, struct iomap *iomap); + void *data, struct iomap *iomap, struct iomap *srcmap); loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, const struct iomap_ops *ops, void *data, diff --git a/fs/iomap.c b/fs/iomap.c index 23ef63fd1669..6648957af268 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -41,6 +41,7 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, const struct iomap_ops *ops, void *data, iomap_actor_t actor) { struct iomap iomap = { 0 }; + struct iomap srcmap = { 0 }; loff_t written = 0, ret; /* @@ -55,7 +56,7 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, * expose transient stale data. If the reserve fails, we can safely * back out at this point as there is nothing to undo. */ - ret = ops->iomap_begin(inode, pos, length, flags, &iomap); + ret = ops->iomap_begin(inode, pos, length, flags, &iomap, &srcmap); if (ret) return ret; if (WARN_ON(iomap.offset > pos)) @@ -75,7 +76,7 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, * we can do the copy-in page by page without having to worry about * failures exposing transient data. */ - written = actor(inode, pos, length, data, &iomap); + written = actor(inode, pos, length, data, &iomap, &srcmap); /* * Now the data has been copied, commit the range we've copied. This @@ -282,7 +283,7 @@ iomap_read_inline_data(struct inode *inode, struct page *page, static loff_t iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap) + struct iomap *iomap, struct iomap *srcmap) { struct iomap_readpage_ctx *ctx = data; struct page *page = ctx->cur_page; @@ -424,7 +425,7 @@ iomap_next_page(struct inode *inode, struct list_head *pages, loff_t pos, static loff_t iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap) + void *data, struct iomap *iomap, struct iomap *srcmap) { struct iomap_readpage_ctx *ctx = data; loff_t done, ret; @@ -444,7 +445,7 @@ iomap_readpages_actor(struct inode *inode, loff_t pos, loff_t length, ctx->cur_page_in_bio = false; } ret = iomap_readpage_actor(inode, pos + done, length - done, - ctx, iomap); + ctx, iomap, srcmap); } return done; @@ -796,7 +797,7 @@ iomap_write_end(struct inode *inode, loff_t pos, unsigned len, static loff_t iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap) + struct iomap *iomap, struct iomap *srcmap) { struct iov_iter *i = data; long status = 0; @@ -913,7 +914,7 @@ __iomap_read_page(struct inode *inode, loff_t offset) static loff_t iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap) + struct iomap *iomap, struct iomap *srcmap) { long status = 0; ssize_t written = 0; @@ -1002,7 +1003,7 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count, - void *data, struct iomap *iomap) + void *data, struct iomap *iomap, struct iomap *srcmap) { bool *did_zero = data; loff_t written = 0; @@ -1071,7 +1072,7 @@ EXPORT_SYMBOL_GPL(iomap_truncate_page); static loff_t iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap) + void *data, struct iomap *iomap, struct iomap *srcmap) { struct page *page = data; int ret; @@ -1169,7 +1170,7 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, static loff_t 
iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap) + struct iomap *iomap, struct iomap *srcmap) { struct fiemap_ctx *ctx = data; loff_t ret = length; @@ -1343,7 +1344,7 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, static loff_t iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length, - void *data, struct iomap *iomap) + void *data, struct iomap *iomap, struct iomap *srcmap) { switch (iomap->type) { case IOMAP_UNWRITTEN: @@ -1389,7 +1390,7 @@ EXPORT_SYMBOL_GPL(iomap_seek_hole); static loff_t iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length, - void *data, struct iomap *iomap) + void *data, struct iomap *iomap, struct iomap *srcmap) { switch (iomap->type) { case IOMAP_HOLE: @@ -1790,7 +1791,7 @@ iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, static loff_t iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap) + void *data, struct iomap *iomap, struct iomap *srcmap) { struct iomap_dio *dio = data; @@ -2057,7 +2058,7 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) * distinction between written and unwritten extents. */ static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos, - loff_t count, void *data, struct iomap *iomap) + loff_t count, void *data, struct iomap *iomap, struct iomap *srcmap) { struct iomap_swapfile_info *isi = data; int error; @@ -2161,7 +2162,7 @@ EXPORT_SYMBOL_GPL(iomap_swapfile_activate); static loff_t iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap) + void *data, struct iomap *iomap, struct iomap *srcmap) { sector_t *bno = data, addr; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 63d323916bba..6116a75f03da 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -925,7 +925,8 @@ xfs_file_iomap_begin( loff_t offset, loff_t length, unsigned flags, - struct iomap *iomap) + struct iomap *iomap, + struct iomap *srcmap) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1148,7 +1149,8 @@ xfs_seek_iomap_begin( loff_t offset, loff_t length, unsigned flags, - struct iomap *iomap) + struct iomap *iomap, + struct iomap *srcmap) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1234,7 +1236,8 @@ xfs_xattr_iomap_begin( loff_t offset, loff_t length, unsigned flags, - struct iomap *iomap) + struct iomap *iomap, + struct iomap *srcmap) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 2103b94cb1bf..f49767c7fd83 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -25,6 +25,7 @@ struct vm_fault; #define IOMAP_MAPPED 0x03 /* blocks allocated at @addr */ #define IOMAP_UNWRITTEN 0x04 /* blocks allocated at @addr in unwritten state */ #define IOMAP_INLINE 0x05 /* data inline in the inode */ +#define IOMAP_COW 0x06 /* copy data from srcmap before writing */ /* * Flags for all iomap mappings: @@ -102,7 +103,8 @@ struct iomap_ops { * The actual length is returned in iomap->length. */ int (*iomap_begin)(struct inode *inode, loff_t pos, loff_t length, - unsigned flags, struct iomap *iomap); + unsigned flags, struct iomap *iomap, + struct iomap *srcmap); /* * Commit and/or unreserve space previous allocated using iomap_begin.
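For context on how the new argument would eventually be consumed (the patch above only threads srcmap through without using it yet), a read-modify-write helper might pick its read source roughly like this; iomap_read_source() is a hypothetical name, not part of this series.

static struct iomap *
iomap_read_source(struct iomap *iomap, struct iomap *srcmap)
{
	/*
	 * For IOMAP_COW the destination blocks are newly allocated, so any
	 * data that must be read in before the write comes from the srcmap
	 * that ->iomap_begin filled in; otherwise read from the mapping
	 * being written.
	 */
	if (iomap->type == IOMAP_COW)
		return srcmap;
	return iomap;
}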