Message ID | 20190326190301.32365-5-rgoldwyn@suse.de (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [01/15] btrfs: create a mount option for dax | expand |
On Tue, Mar 26, 2019 at 02:02:50PM -0500, Goldwyn Rodrigues wrote: > From: Goldwyn Rodrigues <rgoldwyn@suse.com> > > The IOMAP_F_COW is a flag to notify dax that it needs to copy > the data from iomap->cow_addr to iomap->addr, if the start/end > of I/O are not page aligned. > > This also introduces dax_to_dax_copy() which performs a copy > from one part of the device to another, to a maximum of one page. > > Question: Using iomap.cow_addr == 0 means the CoW is to be copied > (or memset) from a hole. Would this be better handled through a flag? > > Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com> > --- > fs/dax.c | 36 ++++++++++++++++++++++++++++++++++++ > include/linux/iomap.h | 3 +++ > 2 files changed, 39 insertions(+) > > diff --git a/fs/dax.c b/fs/dax.c > index ca0671d55aa6..e254535dd830 100644 > --- a/fs/dax.c > +++ b/fs/dax.c > @@ -1051,6 +1051,28 @@ static bool dax_range_is_aligned(struct block_device *bdev, > return true; > } > > +static void dax_to_dax_copy(struct iomap *iomap, loff_t pos, void *daddr, > + size_t len) > +{ > + loff_t blk_start, blk_pg; > + void *saddr; > + ssize_t map_len; > + > + /* A zero address is a hole. */ > + if (iomap->cow_addr == 0) { > + memset(daddr, 0, len); > + return; > + } > + > + blk_start = iomap->cow_addr + pos - iomap->cow_pos; > + blk_pg = round_down(blk_start, PAGE_SIZE); > + > + map_len = dax_direct_access(iomap->dax_dev, PHYS_PFN(blk_pg), PAGE_SIZE, > + &saddr, NULL); > + saddr += blk_start - blk_pg; > + memcpy(daddr, saddr, len); > +} > + > int __dax_zero_page_range(struct block_device *bdev, > struct dax_device *dax_dev, sector_t sector, > unsigned int offset, unsigned int size) > @@ -1143,6 +1165,20 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, > break; > } > > + if (iomap->flags & IOMAP_F_COW) { > + loff_t pg_end = round_up(end, PAGE_SIZE); > + /* > + * Copy the first part of the page > + * Note: we pass offset as length > + */ > + if (offset) > + dax_to_dax_copy(iomap, pos - offset, kaddr, offset); > + > + /* Copy the last part of the range */ > + if (end < pg_end) > + dax_to_dax_copy(iomap, end, kaddr + offset + length, pg_end - end); > + } > + > map_len = PFN_PHYS(map_len); > kaddr += offset; > map_len -= offset; > diff --git a/include/linux/iomap.h b/include/linux/iomap.h > index 0fefb5455bda..391785de1428 100644 > --- a/include/linux/iomap.h > +++ b/include/linux/iomap.h Probably a good idea to cc the iomap maintainers on this (entire patchset)... <cough> --D > @@ -35,6 +35,7 @@ struct vm_fault; > #define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */ > #define IOMAP_F_DIRTY 0x02 /* uncommitted metadata */ > #define IOMAP_F_BUFFER_HEAD 0x04 /* file system requires buffer heads */ > +#define IOMAP_F_COW 0x08 /* cow before write */ > > /* > * Flags that only need to be reported for IOMAP_REPORT requests: > @@ -59,6 +60,8 @@ struct iomap { > u64 length; /* length of mapping, bytes */ > u16 type; /* type of mapping */ > u16 flags; /* flags for mapping */ > + u64 cow_addr; /* read address to perform CoW */ > + loff_t cow_pos; /* file offset of cow_addr */ > struct block_device *bdev; /* block device for I/O */ > struct dax_device *dax_dev; /* dax_dev for dax operations */ > void *inline_data; > -- > 2.16.4 >
On 10:54 27/03, Darrick J. Wong wrote: > On Tue, Mar 26, 2019 at 02:02:50PM -0500, Goldwyn Rodrigues wrote: > > From: Goldwyn Rodrigues <rgoldwyn@suse.com> > > > > The IOMAP_F_COW is a flag to notify dax that it needs to copy > > the data from iomap->cow_addr to iomap->addr, if the start/end > > of I/O are not page aligned. > > > > This also introduces dax_to_dax_copy() which performs a copy > > from one part of the device to another, to a maximum of one page. > > > > Question: Using iomap.cow_addr == 0 means the CoW is to be copied > > (or memset) from a hole. Would this be better handled through a flag? > > > > Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com> > > --- > > fs/dax.c | 36 ++++++++++++++++++++++++++++++++++++ > > include/linux/iomap.h | 3 +++ > > 2 files changed, 39 insertions(+) > > > > diff --git a/fs/dax.c b/fs/dax.c > > index ca0671d55aa6..e254535dd830 100644 > > --- a/fs/dax.c > > +++ b/fs/dax.c > > @@ -1051,6 +1051,28 @@ static bool dax_range_is_aligned(struct block_device *bdev, > > return true; > > } > > > > +static void dax_to_dax_copy(struct iomap *iomap, loff_t pos, void *daddr, > > + size_t len) > > +{ > > + loff_t blk_start, blk_pg; > > + void *saddr; > > + ssize_t map_len; > > + > > + /* A zero address is a hole. */ > > + if (iomap->cow_addr == 0) { > > + memset(daddr, 0, len); > > + return; > > + } > > + > > + blk_start = iomap->cow_addr + pos - iomap->cow_pos; > > + blk_pg = round_down(blk_start, PAGE_SIZE); > > + > > + map_len = dax_direct_access(iomap->dax_dev, PHYS_PFN(blk_pg), PAGE_SIZE, > > + &saddr, NULL); > > + saddr += blk_start - blk_pg; > > + memcpy(daddr, saddr, len); > > +} > > + > > int __dax_zero_page_range(struct block_device *bdev, > > struct dax_device *dax_dev, sector_t sector, > > unsigned int offset, unsigned int size) > > @@ -1143,6 +1165,20 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, > > break; > > } > > > > + if (iomap->flags & IOMAP_F_COW) { > > + loff_t pg_end = round_up(end, PAGE_SIZE); > > + /* > > + * Copy the first part of the page > > + * Note: we pass offset as length > > + */ > > + if (offset) > > + dax_to_dax_copy(iomap, pos - offset, kaddr, offset); > > + > > + /* Copy the last part of the range */ > > + if (end < pg_end) > > + dax_to_dax_copy(iomap, end, kaddr + offset + length, pg_end - end); > > + } > > + > > map_len = PFN_PHYS(map_len); > > kaddr += offset; > > map_len -= offset; > > diff --git a/include/linux/iomap.h b/include/linux/iomap.h > > index 0fefb5455bda..391785de1428 100644 > > --- a/include/linux/iomap.h > > +++ b/include/linux/iomap.h > > Probably a good idea to cc the iomap maintainers on this (entire > patchset)... <cough> Yup. Will include you and Christoph in the future iterations. This will not be the last iteration for this patchset. Looks like fsdevel and btrfs was too narrow a list ;) > > --D > > > @@ -35,6 +35,7 @@ struct vm_fault; > > #define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */ > > #define IOMAP_F_DIRTY 0x02 /* uncommitted metadata */ > > #define IOMAP_F_BUFFER_HEAD 0x04 /* file system requires buffer heads */ > > +#define IOMAP_F_COW 0x08 /* cow before write */ > > > > /* > > * Flags that only need to be reported for IOMAP_REPORT requests: > > @@ -59,6 +60,8 @@ struct iomap { > > u64 length; /* length of mapping, bytes */ > > u16 type; /* type of mapping */ > > u16 flags; /* flags for mapping */ > > + u64 cow_addr; /* read address to perform CoW */ > > + loff_t cow_pos; /* file offset of cow_addr */ > > struct block_device *bdev; /* block device for I/O */ > > struct dax_device *dax_dev; /* dax_dev for dax operations */ > > void *inline_data; > > -- > > 2.16.4 > > >
On Wed, Mar 27, 2019 at 01:58:53PM -0500, Goldwyn Rodrigues wrote: > On 10:54 27/03, Darrick J. Wong wrote: > > On Tue, Mar 26, 2019 at 02:02:50PM -0500, Goldwyn Rodrigues wrote: > > > From: Goldwyn Rodrigues <rgoldwyn@suse.com> > > > > > > The IOMAP_F_COW is a flag to notify dax that it needs to copy > > > the data from iomap->cow_addr to iomap->addr, if the start/end > > > of I/O are not page aligned. > > > > > > This also introduces dax_to_dax_copy() which performs a copy > > > from one part of the device to another, to a maximum of one page. > > > > > > Question: Using iomap.cow_addr == 0 means the CoW is to be copied > > > (or memset) from a hole. Would this be better handled through a flag? > > > > > > Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com> > > > --- > > > fs/dax.c | 36 ++++++++++++++++++++++++++++++++++++ > > > include/linux/iomap.h | 3 +++ > > > 2 files changed, 39 insertions(+) > > > > > > diff --git a/fs/dax.c b/fs/dax.c > > > index ca0671d55aa6..e254535dd830 100644 > > > --- a/fs/dax.c > > > +++ b/fs/dax.c > > > @@ -1051,6 +1051,28 @@ static bool dax_range_is_aligned(struct block_device *bdev, > > > return true; > > > } > > > > > > +static void dax_to_dax_copy(struct iomap *iomap, loff_t pos, void *daddr, > > > + size_t len) Hmm... is this a dax copy-in function? I think what's going on here is that we have a request to copy into the file @len bytes at offset @pos; @daddr is the pmemory address of file offset @pos, and the @iomap is supposed to have a cow address (or nothing). (Would love a comment here, even if it is a static helper.) > > > +{ > > > + loff_t blk_start, blk_pg; > > > + void *saddr; > > > + ssize_t map_len; > > > + > > > + /* A zero address is a hole. */ > > > + if (iomap->cow_addr == 0) { > > > + memset(daddr, 0, len); > > > + return; > > > + } > > > + > > > + blk_start = iomap->cow_addr + pos - iomap->cow_pos; > > > + blk_pg = round_down(blk_start, PAGE_SIZE); > > > + > > > + map_len = dax_direct_access(iomap->dax_dev, PHYS_PFN(blk_pg), PAGE_SIZE, > > > + &saddr, NULL); /me wonders if we're supposed to do something with map_len here? > > > + saddr += blk_start - blk_pg; > > > + memcpy(daddr, saddr, len); > > > +} > > > + > > > int __dax_zero_page_range(struct block_device *bdev, > > > struct dax_device *dax_dev, sector_t sector, > > > unsigned int offset, unsigned int size) > > > @@ -1143,6 +1165,20 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, > > > break; > > > } > > > > > > + if (iomap->flags & IOMAP_F_COW) { > > > + loff_t pg_end = round_up(end, PAGE_SIZE); > > > + /* > > > + * Copy the first part of the page > > > + * Note: we pass offset as length > > > + */ > > > + if (offset) > > > + dax_to_dax_copy(iomap, pos - offset, kaddr, offset); > > > + > > > + /* Copy the last part of the range */ > > > + if (end < pg_end) > > > + dax_to_dax_copy(iomap, end, kaddr + offset + length, pg_end - end); > > > + } > > > + > > > map_len = PFN_PHYS(map_len); > > > kaddr += offset; > > > map_len -= offset; > > > diff --git a/include/linux/iomap.h b/include/linux/iomap.h > > > index 0fefb5455bda..391785de1428 100644 > > > --- a/include/linux/iomap.h > > > +++ b/include/linux/iomap.h > > > > Probably a good idea to cc the iomap maintainers on this (entire > > patchset)... <cough> > > Yup. Will include you and Christoph in the future iterations. This will > not be the last iteration for this patchset. > > Looks like fsdevel and btrfs was too narrow a list ;) Not necessarily, I /did/ see this on both lists. Just a friendly reminder that iomap has maintainers now and is no longer adrift in the VFS. :) > > > > --D > > > > > @@ -35,6 +35,7 @@ struct vm_fault; > > > #define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */ > > > #define IOMAP_F_DIRTY 0x02 /* uncommitted metadata */ > > > #define IOMAP_F_BUFFER_HEAD 0x04 /* file system requires buffer heads */ > > > +#define IOMAP_F_COW 0x08 /* cow before write */ This could use some more elaboration, since I couldn't figure out with certainty how this mechanism is supposed to work... > > > > > > /* > > > * Flags that only need to be reported for IOMAP_REPORT requests: > > > @@ -59,6 +60,8 @@ struct iomap { > > > u64 length; /* length of mapping, bytes */ > > > u16 type; /* type of mapping */ > > > u16 flags; /* flags for mapping */ > > > + u64 cow_addr; /* read address to perform CoW */ Oh, I see, IOMAP_F_COW means @cow_addr points to the copy source address, not the destination space we just allocated. /* * A copy-on-write operation is necessary to maintain data integrity. * Write actors must copy the portion of the file that will not be * overwritten by the write from @cow_addr to @addr. */ #define IOMAP_F_COW 0x08 > > > + loff_t cow_pos; /* file offset of cow_addr */ Why wouldn't this be named @cow_offset? And why wouldn't it be the same as @offset? --D > > > struct block_device *bdev; /* block device for I/O */ > > > struct dax_device *dax_dev; /* dax_dev for dax operations */ > > > void *inline_data; > > > -- > > > 2.16.4 > > > > > > > -- > Goldwyn
On Tue, Mar 26, 2019 at 02:02:50PM -0500, Goldwyn Rodrigues wrote: > From: Goldwyn Rodrigues <rgoldwyn@suse.com> > > The IOMAP_F_COW is a flag to notify dax that it needs to copy > the data from iomap->cow_addr to iomap->addr, if the start/end > of I/O are not page aligned. I see what you are trying to do here, but this is kinda gross. > This also introduces dax_to_dax_copy() which performs a copy > from one part of the device to another, to a maximum of one page. > > Question: Using iomap.cow_addr == 0 means the CoW is to be copied > (or memset) from a hole. Would this be better handled through a flag? That's what all these checks in the iomap code do: if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) Oh, wait, you're trying to map two ranges in a single iomap and then infer state from information that got chucked away.... IOWs, you're doing it wrong - iomap algorithms are driven by how we manipulate iomaps to do data operations efficiently, not how we copy data page by page. IOWs, what we really should have here is two iomaps - a source and a destination iomap. The source is a read mapping of the current address (where we are going to copy the data from), the destination is the post-cow allocation mapping (where the data goes). Now you just copy the data from one map to the other iterating source mappings until the necessary range of the destination has been covered. And you can check if the source map is IOMAP_HOLE or IOMAP_UNWRITTEN and hence optimise the copy (i.e. zero the new allocation) before copying in the new data. Even better, only do the source mapping if the write isn't page/filesystem block aligned, and hence only do the slow path copying if a source mapping exists.... > diff --git a/include/linux/iomap.h b/include/linux/iomap.h > index 0fefb5455bda..391785de1428 100644 > --- a/include/linux/iomap.h > +++ b/include/linux/iomap.h > @@ -35,6 +35,7 @@ struct vm_fault; > #define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */ > #define IOMAP_F_DIRTY 0x02 /* uncommitted metadata */ > #define IOMAP_F_BUFFER_HEAD 0x04 /* file system requires buffer heads */ > +#define IOMAP_F_COW 0x08 /* cow before write */ "Copy on write before write"? :) Cheers, Dave.
On 15:38 01/04, Dave Chinner wrote: > On Tue, Mar 26, 2019 at 02:02:50PM -0500, Goldwyn Rodrigues wrote: > > From: Goldwyn Rodrigues <rgoldwyn@suse.com> > > > > The IOMAP_F_COW is a flag to notify dax that it needs to copy > > the data from iomap->cow_addr to iomap->addr, if the start/end > > of I/O are not page aligned. > > I see what you are trying to do here, but this is kinda gross. > > > This also introduces dax_to_dax_copy() which performs a copy > > from one part of the device to another, to a maximum of one page. > > > > Question: Using iomap.cow_addr == 0 means the CoW is to be copied > > (or memset) from a hole. Would this be better handled through a flag? > > That's what all these checks in the iomap code do: > This is using iomap->flags not type. > if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) > > Oh, wait, you're trying to map two ranges in a single iomap and then > infer state from information that got chucked away.... IOWs, you're > doing it wrong - iomap algorithms are driven by how we manipulate > iomaps to do data operations efficiently, not how we copy data page > by page. > > IOWs, what we really should have here is two iomaps - a source > and a destination iomap. The source is a read mapping of the > current address (where we are going to copy the data from), the > destination is the post-cow allocation mapping (where the data > goes). > > Now you just copy the data from one map to the other iterating > source mappings until the necessary range of the destination has > been covered. And you can check if the source map is IOMAP_HOLE or > IOMAP_UNWRITTEN and hence optimise the copy (i.e. zero the new > allocation) before copying in the new data. Won't that be inefficient? With CoW we only need to write the first and last block. Again, that is not required if the offset/end offset is block aligned. After that, it falls back to the regular write path of performing dax_copy_to_iter(). We don't deal with IOMAP_UNWRITTEN in dax, though other filesystems in the future may use CoW without dax. Besides, what you are suggesting will not fit well with the current iomap iterator code and would require another function altogether. It is better suited for like file comparison range: patch [11/16]. This would end up complicating/bloating the code. > > Even better, only do the source mapping if the write isn't > page/filesystem block aligned, and hence only do the slow path > copying if a source mapping exists.... > The code already does this. After Darrick's suggestion, we can even do away with cow_pos, so only the read address of cow_addr will exist. > > diff --git a/include/linux/iomap.h b/include/linux/iomap.h > > index 0fefb5455bda..391785de1428 100644 > > --- a/include/linux/iomap.h > > +++ b/include/linux/iomap.h > > @@ -35,6 +35,7 @@ struct vm_fault; > > #define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */ > > #define IOMAP_F_DIRTY 0x02 /* uncommitted metadata */ > > #define IOMAP_F_BUFFER_HEAD 0x04 /* file system requires buffer heads */ > > +#define IOMAP_F_COW 0x08 /* cow before write */ > > "Copy on write before write"? :) Yes, I could use a better comment. Thanks for pointing.
On Mon, Apr 01, 2019 at 04:41:02PM -0500, Goldwyn Rodrigues wrote: > On 15:38 01/04, Dave Chinner wrote: > > On Tue, Mar 26, 2019 at 02:02:50PM -0500, Goldwyn Rodrigues wrote: > > > From: Goldwyn Rodrigues <rgoldwyn@suse.com> > > > > > > The IOMAP_F_COW is a flag to notify dax that it needs to copy > > > the data from iomap->cow_addr to iomap->addr, if the start/end > > > of I/O are not page aligned. > > > > I see what you are trying to do here, but this is kinda gross. > > > > > This also introduces dax_to_dax_copy() which performs a copy > > > from one part of the device to another, to a maximum of one page. > > > > > > Question: Using iomap.cow_addr == 0 means the CoW is to be copied > > > (or memset) from a hole. Would this be better handled through a flag? > > > > That's what all these checks in the iomap code do: > > > > This is using iomap->flags not type. Yes, I know. The fact that you tell me this (when it was obvious) indicates to me that you didn't understand what I was saying. i.e. the gross hack here is that this patch is trying to define a new iomap type - both behaviourally and iomap content - via adding a modifier flag rather than defining a new iomap->type. That's the gross hack, and everything stems from that. i.e. the "bloating" of the struct iomap is caused because the flag modifier (IOMAP_F_COW) can't use parts of the iomap that are defined for specific iomap types. e.g. IOMAP_INLINE type uses ->inline_data, and so it can't be re-used by a iomap flag modifier such as IOMAP_F_COW. However, if we define a new type for this "need multiple mappings" iomap rather than a flag, we don't need any new fields in the struct iomap because we can use what already exists in the iomap. > > if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) > > > > Oh, wait, you're trying to map two ranges in a single iomap and then > > infer state from information that got chucked away.... IOWs, you're > > doing it wrong - iomap algorithms are driven by how we manipulate > > iomaps to do data operations efficiently, not how we copy data page > > by page. > > > > IOWs, what we really should have here is two iomaps - a source > > and a destination iomap. The source is a read mapping of the > > current address (where we are going to copy the data from), the > > destination is the post-cow allocation mapping (where the data > > goes). > > > > Now you just copy the data from one map to the other iterating > > source mappings until the necessary range of the destination has > > been covered. And you can check if the source map is IOMAP_HOLE or > > IOMAP_UNWRITTEN and hence optimise the copy (i.e. zero the new > > allocation) before copying in the new data. > > Won't that be inefficient? With CoW we only need to write the first > and last block. You're assuming that partial data overwrites are the only case where this dax-to-dax copy of a file range is required. That assumption is false. i.e. FALLOC_FL_UNSHARE_RANGE for DAX requires iterating over the entire source range and copying all the contents to the newly allocated destination range. The partial block copying is just a short version of this limited to a single block. Sure, btrfs doesn't support FALLOC_FL_UNSHARE_RANGE, but if you're going to be adding support for reflink to DAX, the infrastructure needs to provide support for performing FALLOC_FL_UNSHARE_RANGE to break extent sharing efficiently. > Again, that is not required if the offset/end > offset is block aligned. After that, it falls back to the > regular write path of performing dax_copy_to_iter(). > We don't deal with IOMAP_UNWRITTEN in dax, Yes we do. fallocate() can lay down unwritten extents, and we can both read and write to them. See, for example, dax_iomap_actor() called from dax_iomap_rw() via iomap_apply() - it does not call dax_copy_to_iter() for reads if the range is IOMAP_HOLE or IOMAP_UNWRITTEN. > though other > filesystems in the future may use CoW without dax. That makes no sense. :/ > Besides, what you are suggesting will not fit well with the > current iomap iterator code and would require another function > altogether. I'm not concerned about that - I would much prefer we do things cleanly and properly rather than make expedient hacks for questionable benefit that we'll have to completely rework or remove the moment we implement DAX+reflink support in XFS. > After Darrick's suggestion, we can even do away with cow_pos, so > only the read address of cow_addr will exist. As I mentioned earlier, even that is not necessary. This is DAX - the iomap API and mapping functions can already return pointers to inline data, and DAX can effectively be considered inline data for the purposes of reading data. As I said, the problem here is you are trying to use flags to define a new type of iomap operation requires two mappings rather than one. IMO, we should be defining an IOMAP_DAX_COW /type/ and then define it to contain and behave as follows: - new destination region for data to be copied into is the same setup as IOMAP_MAPPED - existing shared data that may be needed for reading is mapped direct to device address by ->iomap_begin into iomap->inline_data - if the iomap infrastructure needs to copy original source data into destination, it copies directly from the memory address in iomap->inline_data into the directly mapped DAX desitination via memcpy(). This covers both the partial write COW case you are concerned with here, and the full-extent range copy case that FALLOC_FL_UNSHARE_RANGE requires. Cheers, Dave.
On 10:06 02/04, Dave Chinner wrote: > On Mon, Apr 01, 2019 at 04:41:02PM -0500, Goldwyn Rodrigues wrote: > > On 15:38 01/04, Dave Chinner wrote: > > > On Tue, Mar 26, 2019 at 02:02:50PM -0500, Goldwyn Rodrigues wrote: > > > > From: Goldwyn Rodrigues <rgoldwyn@suse.com> > > > > > > > > The IOMAP_F_COW is a flag to notify dax that it needs to copy > > > > the data from iomap->cow_addr to iomap->addr, if the start/end > > > > of I/O are not page aligned. > > > > > > I see what you are trying to do here, but this is kinda gross. > > > > > > > This also introduces dax_to_dax_copy() which performs a copy > > > > from one part of the device to another, to a maximum of one page. > > > > > > > > Question: Using iomap.cow_addr == 0 means the CoW is to be copied > > > > (or memset) from a hole. Would this be better handled through a flag? > > > > > > That's what all these checks in the iomap code do: > > > > > > > This is using iomap->flags not type. > > Yes, I know. The fact that you tell me this (when it was obvious) > indicates to me that you didn't understand what I was saying. > > i.e. the gross hack here is that this patch is trying to define a > new iomap type - both behaviourally and iomap content - via adding > a modifier flag rather than defining a new iomap->type. That's the > gross hack, and everything stems from that. > > i.e. the "bloating" of the struct iomap is caused because the flag > modifier (IOMAP_F_COW) can't use parts of the iomap that are defined > for specific iomap types. e.g. IOMAP_INLINE type uses ->inline_data, > and so it can't be re-used by a iomap flag modifier such as > IOMAP_F_COW. > > However, if we define a new type for this "need multiple mappings" > iomap rather than a flag, we don't need any new fields in the struct > iomap because we can use what already exists in the iomap. > > > > if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) > > > > > > Oh, wait, you're trying to map two ranges in a single iomap and then > > > infer state from information that got chucked away.... IOWs, you're > > > doing it wrong - iomap algorithms are driven by how we manipulate > > > iomaps to do data operations efficiently, not how we copy data page > > > by page. > > > > > > IOWs, what we really should have here is two iomaps - a source > > > and a destination iomap. The source is a read mapping of the > > > current address (where we are going to copy the data from), the > > > destination is the post-cow allocation mapping (where the data > > > goes). > > > > > > Now you just copy the data from one map to the other iterating > > > source mappings until the necessary range of the destination has > > > been covered. And you can check if the source map is IOMAP_HOLE or > > > IOMAP_UNWRITTEN and hence optimise the copy (i.e. zero the new > > > allocation) before copying in the new data. > > > > Won't that be inefficient? With CoW we only need to write the first > > and last block. > > You're assuming that partial data overwrites are the only case where > this dax-to-dax copy of a file range is required. That assumption is > false. > > i.e. FALLOC_FL_UNSHARE_RANGE for DAX requires iterating over the > entire source range and copying all the contents to the newly > allocated destination range. The partial block copying is just a > short version of this limited to a single block. > > Sure, btrfs doesn't support FALLOC_FL_UNSHARE_RANGE, but if you're > going to be adding support for reflink to DAX, the infrastructure > needs to provide support for performing FALLOC_FL_UNSHARE_RANGE > to break extent sharing efficiently. > > > Again, that is not required if the offset/end > > offset is block aligned. After that, it falls back to the > > regular write path of performing dax_copy_to_iter(). > > We don't deal with IOMAP_UNWRITTEN in dax, > > Yes we do. fallocate() can lay down unwritten extents, and we can > both read and write to them. See, for example, dax_iomap_actor() > called from dax_iomap_rw() via iomap_apply() - it does not call > dax_copy_to_iter() for reads if the range is IOMAP_HOLE or > IOMAP_UNWRITTEN. > > > though other > > filesystems in the future may use CoW without dax. > > That makes no sense. :/ > > > Besides, what you are suggesting will not fit well with the > > current iomap iterator code and would require another function > > altogether. > > I'm not concerned about that - I would much prefer we do things > cleanly and properly rather than make expedient hacks for > questionable benefit that we'll have to completely rework or remove > the moment we implement DAX+reflink support in XFS. > > > After Darrick's suggestion, we can even do away with cow_pos, so > > only the read address of cow_addr will exist. > > As I mentioned earlier, even that is not necessary. > > This is DAX - the iomap API and mapping functions can already return > pointers to inline data, and DAX can effectively be considered > inline data for the purposes of reading data. > > As I said, the problem here is you are trying to use flags to define > a new type of iomap operation requires two mappings rather than one. > IMO, we should be defining an IOMAP_DAX_COW /type/ and then define > it to contain and behave as follows: > > - new destination region for data to be copied into is the > same setup as IOMAP_MAPPED > - existing shared data that may be needed for reading is > mapped direct to device address by ->iomap_begin into > iomap->inline_data > - if the iomap infrastructure needs to copy original source > data into destination, it copies directly from the memory > address in iomap->inline_data into the directly mapped DAX > desitination via memcpy(). > > This covers both the partial write COW case you are concerned with > here, and the full-extent range copy case that > FALLOC_FL_UNSHARE_RANGE requires. > Ok, understood. However, we may have to differentiate between a CoW with FALLOC_FL_UNSHARE_RANGE because CoW will only (conditionally) copy the first and the last block wheras FALLOC_FL_UNSHARE_RANGE will copy the whole range. So, should it be another type? Also, this would require iomap_begin to translate block->dax address which I suppose can be a function in the dax code.
On Tue, Apr 02, 2019 at 08:56:31PM -0500, Goldwyn Rodrigues wrote: > On 10:06 02/04, Dave Chinner wrote: > > On Mon, Apr 01, 2019 at 04:41:02PM -0500, Goldwyn Rodrigues wrote: > > > After Darrick's suggestion, we can even do away with cow_pos, so > > > only the read address of cow_addr will exist. > > > > As I mentioned earlier, even that is not necessary. > > > > This is DAX - the iomap API and mapping functions can already return > > pointers to inline data, and DAX can effectively be considered > > inline data for the purposes of reading data. > > > > As I said, the problem here is you are trying to use flags to define > > a new type of iomap operation requires two mappings rather than one. > > IMO, we should be defining an IOMAP_DAX_COW /type/ and then define > > it to contain and behave as follows: > > > > - new destination region for data to be copied into is the > > same setup as IOMAP_MAPPED > > - existing shared data that may be needed for reading is > > mapped direct to device address by ->iomap_begin into > > iomap->inline_data > > - if the iomap infrastructure needs to copy original source > > data into destination, it copies directly from the memory > > address in iomap->inline_data into the directly mapped DAX > > desitination via memcpy(). > > > > This covers both the partial write COW case you are concerned with > > here, and the full-extent range copy case that > > FALLOC_FL_UNSHARE_RANGE requires. > > > > Ok, understood. However, we may have to differentiate between a CoW > with FALLOC_FL_UNSHARE_RANGE because CoW will only (conditionally) > copy the first and the last block wheras FALLOC_FL_UNSHARE_RANGE > will copy the whole range. So, should it be another type? I don't see why that would be necessary. The calling contexts are completely different, but both know exactly what they have to copy from the iomap's inline data. They fact they copy different parts of data from the iomap doesn't mean we need different types of iomaps for them... > Also, this would require iomap_begin to translate block->dax address > which I suppose can be a function in the dax code. bdev_dax_pgoff() + dax_direct_access() should already provide what we need here. Cheers, Dave.
On Mon, Apr 01, 2019 at 03:38:52PM +1100, Dave Chinner wrote: > On Tue, Mar 26, 2019 at 02:02:50PM -0500, Goldwyn Rodrigues wrote: > > From: Goldwyn Rodrigues <rgoldwyn@suse.com> > > > > The IOMAP_F_COW is a flag to notify dax that it needs to copy > > the data from iomap->cow_addr to iomap->addr, if the start/end > > of I/O are not page aligned. > > I see what you are trying to do here, but this is kinda gross. Even more gross than our current hack where for buffered I/O we return the previous address but still allocate delalloc blocks for the COW? ;-) This whole area is a mine field.. > Oh, wait, you're trying to map two ranges in a single iomap and then > infer state from information that got chucked away.... IOWs, you're > doing it wrong - iomap algorithms are driven by how we manipulate > iomaps to do data operations efficiently, not how we copy data page > by page. > > IOWs, what we really should have here is two iomaps - a source > and a destination iomap. The source is a read mapping of the > current address (where we are going to copy the data from), the > destination is the post-cow allocation mapping (where the data > goes). > > Now you just copy the data from one map to the other iterating > source mappings until the necessary range of the destination has > been covered. And you can check if the source map is IOMAP_HOLE or > IOMAP_UNWRITTEN and hence optimise the copy (i.e. zero the new > allocation) before copying in the new data. > > Even better, only do the source mapping if the write isn't > page/filesystem block aligned, and hence only do the slow path > copying if a source mapping exists.... I looked into the do two maps instead of the mentioned above existing gross hack when doing the buffer_head removal for iomap. The major issue with it is that we need a way for the second recursive mapping to not take the inode-wide locks. I still think it is worth it (or move to per-fork or range locks) as it defintively is the much cleaner way to handle it. I just didn't bother to go there yet as I wanted the buffer_head removal and thus kept the existing gross hack for now. But moving to a two mapping scheme for read-modify-write ops would be a major improvement for sure.
diff --git a/fs/dax.c b/fs/dax.c index ca0671d55aa6..e254535dd830 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1051,6 +1051,28 @@ static bool dax_range_is_aligned(struct block_device *bdev, return true; } +static void dax_to_dax_copy(struct iomap *iomap, loff_t pos, void *daddr, + size_t len) +{ + loff_t blk_start, blk_pg; + void *saddr; + ssize_t map_len; + + /* A zero address is a hole. */ + if (iomap->cow_addr == 0) { + memset(daddr, 0, len); + return; + } + + blk_start = iomap->cow_addr + pos - iomap->cow_pos; + blk_pg = round_down(blk_start, PAGE_SIZE); + + map_len = dax_direct_access(iomap->dax_dev, PHYS_PFN(blk_pg), PAGE_SIZE, + &saddr, NULL); + saddr += blk_start - blk_pg; + memcpy(daddr, saddr, len); +} + int __dax_zero_page_range(struct block_device *bdev, struct dax_device *dax_dev, sector_t sector, unsigned int offset, unsigned int size) @@ -1143,6 +1165,20 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, break; } + if (iomap->flags & IOMAP_F_COW) { + loff_t pg_end = round_up(end, PAGE_SIZE); + /* + * Copy the first part of the page + * Note: we pass offset as length + */ + if (offset) + dax_to_dax_copy(iomap, pos - offset, kaddr, offset); + + /* Copy the last part of the range */ + if (end < pg_end) + dax_to_dax_copy(iomap, end, kaddr + offset + length, pg_end - end); + } + map_len = PFN_PHYS(map_len); kaddr += offset; map_len -= offset; diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 0fefb5455bda..391785de1428 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -35,6 +35,7 @@ struct vm_fault; #define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */ #define IOMAP_F_DIRTY 0x02 /* uncommitted metadata */ #define IOMAP_F_BUFFER_HEAD 0x04 /* file system requires buffer heads */ +#define IOMAP_F_COW 0x08 /* cow before write */ /* * Flags that only need to be reported for IOMAP_REPORT requests: @@ -59,6 +60,8 @@ struct iomap { u64 length; /* length of mapping, bytes */ u16 type; /* type of mapping */ u16 flags; /* flags for mapping */ + u64 cow_addr; /* read address to perform CoW */ + loff_t cow_pos; /* file offset of cow_addr */ struct block_device *bdev; /* block device for I/O */ struct dax_device *dax_dev; /* dax_dev for dax operations */ void *inline_data;