[04/15] dax: Introduce IOMAP_F_COW for copy-on-write
diff mbox series

Message ID 20190326190301.32365-5-rgoldwyn@suse.de
State New
Headers show
Series
  • [01/15] btrfs: create a mount option for dax
Related show

Commit Message

Goldwyn Rodrigues March 26, 2019, 7:02 p.m. UTC
From: Goldwyn Rodrigues <rgoldwyn@suse.com>

The IOMAP_F_COW is a flag to notify dax that it needs to copy
the data from iomap->cow_addr to iomap->addr, if the start/end
of I/O are not page aligned.

This also introduces dax_to_dax_copy() which performs a copy
from one part of the device to another, to a maximum of one page.

Question: Using iomap.cow_addr == 0 means the CoW is to be copied
(or memset) from a hole. Would this be better handled through a flag?

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 fs/dax.c              | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/iomap.h |  3 +++
 2 files changed, 39 insertions(+)

Comments

Darrick J. Wong March 27, 2019, 5:54 p.m. UTC | #1
On Tue, Mar 26, 2019 at 02:02:50PM -0500, Goldwyn Rodrigues wrote:
> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> 
> The IOMAP_F_COW is a flag to notify dax that it needs to copy
> the data from iomap->cow_addr to iomap->addr, if the start/end
> of I/O are not page aligned.
> 
> This also introduces dax_to_dax_copy() which performs a copy
> from one part of the device to another, to a maximum of one page.
> 
> Question: Using iomap.cow_addr == 0 means the CoW is to be copied
> (or memset) from a hole. Would this be better handled through a flag?
> 
> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> ---
>  fs/dax.c              | 36 ++++++++++++++++++++++++++++++++++++
>  include/linux/iomap.h |  3 +++
>  2 files changed, 39 insertions(+)
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index ca0671d55aa6..e254535dd830 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1051,6 +1051,28 @@ static bool dax_range_is_aligned(struct block_device *bdev,
>  	return true;
>  }
>  
> +static void dax_to_dax_copy(struct iomap *iomap, loff_t pos, void *daddr,
> +			    size_t len)
> +{
> +	loff_t blk_start, blk_pg;
> +	void *saddr;
> +	ssize_t map_len;
> +
> +	/* A zero address is a hole. */
> +	if (iomap->cow_addr == 0) {
> +		memset(daddr, 0, len);
> +		return;
> +	}
> +
> +	blk_start = iomap->cow_addr + pos - iomap->cow_pos;
> +	blk_pg = round_down(blk_start, PAGE_SIZE);
> +
> +	map_len = dax_direct_access(iomap->dax_dev, PHYS_PFN(blk_pg), PAGE_SIZE,
> +			&saddr, NULL);
> +	saddr += blk_start - blk_pg;
> +	memcpy(daddr, saddr, len);
> +}
> +
>  int __dax_zero_page_range(struct block_device *bdev,
>  		struct dax_device *dax_dev, sector_t sector,
>  		unsigned int offset, unsigned int size)
> @@ -1143,6 +1165,20 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>  			break;
>  		}
>  
> +		if (iomap->flags & IOMAP_F_COW) {
> +			loff_t pg_end = round_up(end, PAGE_SIZE);
> +			/*
> +			 * Copy the first part of the page
> +			 * Note: we pass offset as length
> +			 */
> +			if (offset)
> +				dax_to_dax_copy(iomap, pos - offset, kaddr, offset);
> +
> +			/* Copy the last part of the range */
> +			if (end < pg_end)
> +				dax_to_dax_copy(iomap, end, kaddr + offset + length, pg_end - end);
> +		}
> +
>  		map_len = PFN_PHYS(map_len);
>  		kaddr += offset;
>  		map_len -= offset;
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 0fefb5455bda..391785de1428 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h

Probably a good idea to cc the iomap maintainers on this (entire
patchset)... <cough>

--D

> @@ -35,6 +35,7 @@ struct vm_fault;
>  #define IOMAP_F_NEW		0x01	/* blocks have been newly allocated */
>  #define IOMAP_F_DIRTY		0x02	/* uncommitted metadata */
>  #define IOMAP_F_BUFFER_HEAD	0x04	/* file system requires buffer heads */
> +#define IOMAP_F_COW		0x08	/* cow before write */
>  
>  /*
>   * Flags that only need to be reported for IOMAP_REPORT requests:
> @@ -59,6 +60,8 @@ struct iomap {
>  	u64			length;	/* length of mapping, bytes */
>  	u16			type;	/* type of mapping */
>  	u16			flags;	/* flags for mapping */
> +	u64			cow_addr; /* read address to perform CoW */
> +	loff_t			cow_pos; /* file offset of cow_addr */
>  	struct block_device	*bdev;	/* block device for I/O */
>  	struct dax_device	*dax_dev; /* dax_dev for dax operations */
>  	void			*inline_data;
> -- 
> 2.16.4
>
Goldwyn Rodrigues March 27, 2019, 6:58 p.m. UTC | #2
On 10:54 27/03, Darrick J. Wong wrote:
> On Tue, Mar 26, 2019 at 02:02:50PM -0500, Goldwyn Rodrigues wrote:
> > From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> > 
> > The IOMAP_F_COW is a flag to notify dax that it needs to copy
> > the data from iomap->cow_addr to iomap->addr, if the start/end
> > of I/O are not page aligned.
> > 
> > This also introduces dax_to_dax_copy() which performs a copy
> > from one part of the device to another, to a maximum of one page.
> > 
> > Question: Using iomap.cow_addr == 0 means the CoW is to be copied
> > (or memset) from a hole. Would this be better handled through a flag?
> > 
> > Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> > ---
> >  fs/dax.c              | 36 ++++++++++++++++++++++++++++++++++++
> >  include/linux/iomap.h |  3 +++
> >  2 files changed, 39 insertions(+)
> > 
> > diff --git a/fs/dax.c b/fs/dax.c
> > index ca0671d55aa6..e254535dd830 100644
> > --- a/fs/dax.c
> > +++ b/fs/dax.c
> > @@ -1051,6 +1051,28 @@ static bool dax_range_is_aligned(struct block_device *bdev,
> >  	return true;
> >  }
> >  
> > +static void dax_to_dax_copy(struct iomap *iomap, loff_t pos, void *daddr,
> > +			    size_t len)
> > +{
> > +	loff_t blk_start, blk_pg;
> > +	void *saddr;
> > +	ssize_t map_len;
> > +
> > +	/* A zero address is a hole. */
> > +	if (iomap->cow_addr == 0) {
> > +		memset(daddr, 0, len);
> > +		return;
> > +	}
> > +
> > +	blk_start = iomap->cow_addr + pos - iomap->cow_pos;
> > +	blk_pg = round_down(blk_start, PAGE_SIZE);
> > +
> > +	map_len = dax_direct_access(iomap->dax_dev, PHYS_PFN(blk_pg), PAGE_SIZE,
> > +			&saddr, NULL);
> > +	saddr += blk_start - blk_pg;
> > +	memcpy(daddr, saddr, len);
> > +}
> > +
> >  int __dax_zero_page_range(struct block_device *bdev,
> >  		struct dax_device *dax_dev, sector_t sector,
> >  		unsigned int offset, unsigned int size)
> > @@ -1143,6 +1165,20 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
> >  			break;
> >  		}
> >  
> > +		if (iomap->flags & IOMAP_F_COW) {
> > +			loff_t pg_end = round_up(end, PAGE_SIZE);
> > +			/*
> > +			 * Copy the first part of the page
> > +			 * Note: we pass offset as length
> > +			 */
> > +			if (offset)
> > +				dax_to_dax_copy(iomap, pos - offset, kaddr, offset);
> > +
> > +			/* Copy the last part of the range */
> > +			if (end < pg_end)
> > +				dax_to_dax_copy(iomap, end, kaddr + offset + length, pg_end - end);
> > +		}
> > +
> >  		map_len = PFN_PHYS(map_len);
> >  		kaddr += offset;
> >  		map_len -= offset;
> > diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> > index 0fefb5455bda..391785de1428 100644
> > --- a/include/linux/iomap.h
> > +++ b/include/linux/iomap.h
> 
> Probably a good idea to cc the iomap maintainers on this (entire
> patchset)... <cough>

Yup. Will include you and Christoph in the future iterations. This will
not be the last iteration for this patchset.

Looks like fsdevel and btrfs was too narrow a list ;)

> 
> --D
> 
> > @@ -35,6 +35,7 @@ struct vm_fault;
> >  #define IOMAP_F_NEW		0x01	/* blocks have been newly allocated */
> >  #define IOMAP_F_DIRTY		0x02	/* uncommitted metadata */
> >  #define IOMAP_F_BUFFER_HEAD	0x04	/* file system requires buffer heads */
> > +#define IOMAP_F_COW		0x08	/* cow before write */
> >  
> >  /*
> >   * Flags that only need to be reported for IOMAP_REPORT requests:
> > @@ -59,6 +60,8 @@ struct iomap {
> >  	u64			length;	/* length of mapping, bytes */
> >  	u16			type;	/* type of mapping */
> >  	u16			flags;	/* flags for mapping */
> > +	u64			cow_addr; /* read address to perform CoW */
> > +	loff_t			cow_pos; /* file offset of cow_addr */
> >  	struct block_device	*bdev;	/* block device for I/O */
> >  	struct dax_device	*dax_dev; /* dax_dev for dax operations */
> >  	void			*inline_data;
> > -- 
> > 2.16.4
> > 
>
Darrick J. Wong March 28, 2019, 2:45 p.m. UTC | #3
On Wed, Mar 27, 2019 at 01:58:53PM -0500, Goldwyn Rodrigues wrote:
> On 10:54 27/03, Darrick J. Wong wrote:
> > On Tue, Mar 26, 2019 at 02:02:50PM -0500, Goldwyn Rodrigues wrote:
> > > From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> > > 
> > > The IOMAP_F_COW is a flag to notify dax that it needs to copy
> > > the data from iomap->cow_addr to iomap->addr, if the start/end
> > > of I/O are not page aligned.
> > > 
> > > This also introduces dax_to_dax_copy() which performs a copy
> > > from one part of the device to another, to a maximum of one page.
> > > 
> > > Question: Using iomap.cow_addr == 0 means the CoW is to be copied
> > > (or memset) from a hole. Would this be better handled through a flag?
> > > 
> > > Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> > > ---
> > >  fs/dax.c              | 36 ++++++++++++++++++++++++++++++++++++
> > >  include/linux/iomap.h |  3 +++
> > >  2 files changed, 39 insertions(+)
> > > 
> > > diff --git a/fs/dax.c b/fs/dax.c
> > > index ca0671d55aa6..e254535dd830 100644
> > > --- a/fs/dax.c
> > > +++ b/fs/dax.c
> > > @@ -1051,6 +1051,28 @@ static bool dax_range_is_aligned(struct block_device *bdev,
> > >  	return true;
> > >  }
> > >  
> > > +static void dax_to_dax_copy(struct iomap *iomap, loff_t pos, void *daddr,
> > > +			    size_t len)

Hmm... is this a dax copy-in function?  I think what's going on here is
that we have a request to copy into the file @len bytes at offset @pos;
@daddr is the pmemory address of file offset @pos, and the @iomap is
supposed to have a cow address (or nothing).

(Would love a comment here, even if it is a static helper.)

> > > +{
> > > +	loff_t blk_start, blk_pg;
> > > +	void *saddr;
> > > +	ssize_t map_len;
> > > +
> > > +	/* A zero address is a hole. */
> > > +	if (iomap->cow_addr == 0) {
> > > +		memset(daddr, 0, len);
> > > +		return;
> > > +	}
> > > +
> > > +	blk_start = iomap->cow_addr + pos - iomap->cow_pos;
> > > +	blk_pg = round_down(blk_start, PAGE_SIZE);
> > > +
> > > +	map_len = dax_direct_access(iomap->dax_dev, PHYS_PFN(blk_pg), PAGE_SIZE,
> > > +			&saddr, NULL);

/me wonders if we're supposed to do something with map_len here?

> > > +	saddr += blk_start - blk_pg;
> > > +	memcpy(daddr, saddr, len);
> > > +}
> > > +
> > >  int __dax_zero_page_range(struct block_device *bdev,
> > >  		struct dax_device *dax_dev, sector_t sector,
> > >  		unsigned int offset, unsigned int size)
> > > @@ -1143,6 +1165,20 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
> > >  			break;
> > >  		}
> > >  
> > > +		if (iomap->flags & IOMAP_F_COW) {
> > > +			loff_t pg_end = round_up(end, PAGE_SIZE);
> > > +			/*
> > > +			 * Copy the first part of the page
> > > +			 * Note: we pass offset as length
> > > +			 */
> > > +			if (offset)
> > > +				dax_to_dax_copy(iomap, pos - offset, kaddr, offset);
> > > +
> > > +			/* Copy the last part of the range */
> > > +			if (end < pg_end)
> > > +				dax_to_dax_copy(iomap, end, kaddr + offset + length, pg_end - end);
> > > +		}
> > > +
> > >  		map_len = PFN_PHYS(map_len);
> > >  		kaddr += offset;
> > >  		map_len -= offset;
> > > diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> > > index 0fefb5455bda..391785de1428 100644
> > > --- a/include/linux/iomap.h
> > > +++ b/include/linux/iomap.h
> > 
> > Probably a good idea to cc the iomap maintainers on this (entire
> > patchset)... <cough>
> 
> Yup. Will include you and Christoph in the future iterations. This will
> not be the last iteration for this patchset.
> 
> Looks like fsdevel and btrfs was too narrow a list ;)

Not necessarily, I /did/ see this on both lists.  Just a friendly
reminder that iomap has maintainers now and is no longer adrift in the
VFS. :)

> > 
> > --D
> > 
> > > @@ -35,6 +35,7 @@ struct vm_fault;
> > >  #define IOMAP_F_NEW		0x01	/* blocks have been newly allocated */
> > >  #define IOMAP_F_DIRTY		0x02	/* uncommitted metadata */
> > >  #define IOMAP_F_BUFFER_HEAD	0x04	/* file system requires buffer heads */
> > > +#define IOMAP_F_COW		0x08	/* cow before write */

This could use some more elaboration, since I couldn't figure out with
certainty how this mechanism is supposed to work...

> > >  
> > >  /*
> > >   * Flags that only need to be reported for IOMAP_REPORT requests:
> > > @@ -59,6 +60,8 @@ struct iomap {
> > >  	u64			length;	/* length of mapping, bytes */
> > >  	u16			type;	/* type of mapping */
> > >  	u16			flags;	/* flags for mapping */
> > > +	u64			cow_addr; /* read address to perform CoW */

Oh, I see, IOMAP_F_COW means @cow_addr points to the copy source
address, not the destination space we just allocated.

/*
 * A copy-on-write operation is necessary to maintain data integrity.
 * Write actors must copy the portion of the file that will not be
 * overwritten by the write from @cow_addr to @addr.
 */
#define IOMAP_F_COW		0x08

> > > +	loff_t			cow_pos; /* file offset of cow_addr */

Why wouldn't this be named @cow_offset?

And why wouldn't it be the same as @offset?

--D

> > >  	struct block_device	*bdev;	/* block device for I/O */
> > >  	struct dax_device	*dax_dev; /* dax_dev for dax operations */
> > >  	void			*inline_data;
> > > -- 
> > > 2.16.4
> > > 
> > 
> 
> -- 
> Goldwyn
Dave Chinner April 1, 2019, 4:38 a.m. UTC | #4
On Tue, Mar 26, 2019 at 02:02:50PM -0500, Goldwyn Rodrigues wrote:
> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> 
> The IOMAP_F_COW is a flag to notify dax that it needs to copy
> the data from iomap->cow_addr to iomap->addr, if the start/end
> of I/O are not page aligned.

I see what you are trying to do here, but this is kinda gross.

> This also introduces dax_to_dax_copy() which performs a copy
> from one part of the device to another, to a maximum of one page.
> 
> Question: Using iomap.cow_addr == 0 means the CoW is to be copied
> (or memset) from a hole. Would this be better handled through a flag?

That's what all these checks in the iomap code do:

	if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)

Oh, wait, you're trying to map two ranges in a single iomap and then
infer state from information that got chucked away.... IOWs, you're
doing it wrong - iomap algorithms are driven by how we manipulate
iomaps to do data operations efficiently, not how we copy data page
by page.

IOWs, what we really should have here is two iomaps - a source
and a destination iomap. The source is a read mapping of the
current address (where we are going to copy the data from), the
destination is the post-cow allocation mapping (where the data
goes).

Now you just copy the data from one map to the other iterating
source mappings until the necessary range of the destination has
been covered.  And you can check if the source map is IOMAP_HOLE or
IOMAP_UNWRITTEN and hence optimise the copy (i.e. zero the new
allocation) before copying in the new data.

Even better, only do the source mapping if the write isn't
page/filesystem block aligned, and hence only do the slow path
copying if a source mapping exists....

> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 0fefb5455bda..391785de1428 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -35,6 +35,7 @@ struct vm_fault;
>  #define IOMAP_F_NEW		0x01	/* blocks have been newly allocated */
>  #define IOMAP_F_DIRTY		0x02	/* uncommitted metadata */
>  #define IOMAP_F_BUFFER_HEAD	0x04	/* file system requires buffer heads */
> +#define IOMAP_F_COW		0x08	/* cow before write */

"Copy on write before write"? :)

Cheers,

Dave.
Goldwyn Rodrigues April 1, 2019, 9:41 p.m. UTC | #5
On 15:38 01/04, Dave Chinner wrote:
> On Tue, Mar 26, 2019 at 02:02:50PM -0500, Goldwyn Rodrigues wrote:
> > From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> > 
> > The IOMAP_F_COW is a flag to notify dax that it needs to copy
> > the data from iomap->cow_addr to iomap->addr, if the start/end
> > of I/O are not page aligned.
> 
> I see what you are trying to do here, but this is kinda gross.
> 
> > This also introduces dax_to_dax_copy() which performs a copy
> > from one part of the device to another, to a maximum of one page.
> > 
> > Question: Using iomap.cow_addr == 0 means the CoW is to be copied
> > (or memset) from a hole. Would this be better handled through a flag?
> 
> That's what all these checks in the iomap code do:
> 

This is using iomap->flags not type.

> 	if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
> 
> Oh, wait, you're trying to map two ranges in a single iomap and then
> infer state from information that got chucked away.... IOWs, you're
> doing it wrong - iomap algorithms are driven by how we manipulate
> iomaps to do data operations efficiently, not how we copy data page
> by page.
> 
> IOWs, what we really should have here is two iomaps - a source
> and a destination iomap. The source is a read mapping of the
> current address (where we are going to copy the data from), the
> destination is the post-cow allocation mapping (where the data
> goes).
> 
> Now you just copy the data from one map to the other iterating
> source mappings until the necessary range of the destination has
> been covered.  And you can check if the source map is IOMAP_HOLE or
> IOMAP_UNWRITTEN and hence optimise the copy (i.e. zero the new
> allocation) before copying in the new data.

Won't that be inefficient? With CoW we only need to write the first
and last block. Again, that is not required if the offset/end
offset is block aligned. After that, it falls back to the
regular write path of performing dax_copy_to_iter().
We don't deal with IOMAP_UNWRITTEN in dax, though other
filesystems in the future may use CoW without dax.

Besides, what you are suggesting will not fit well with the
current iomap iterator code and would require another function
altogether. It is better suited for like file comparison range:
patch [11/16]. This would end up complicating/bloating the code.

> 
> Even better, only do the source mapping if the write isn't
> page/filesystem block aligned, and hence only do the slow path
> copying if a source mapping exists....
> 

The code already does this.
After Darrick's suggestion, we can even do away with cow_pos, so
only the read address of cow_addr will exist.

> > diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> > index 0fefb5455bda..391785de1428 100644
> > --- a/include/linux/iomap.h
> > +++ b/include/linux/iomap.h
> > @@ -35,6 +35,7 @@ struct vm_fault;
> >  #define IOMAP_F_NEW		0x01	/* blocks have been newly allocated */
> >  #define IOMAP_F_DIRTY		0x02	/* uncommitted metadata */
> >  #define IOMAP_F_BUFFER_HEAD	0x04	/* file system requires buffer heads */
> > +#define IOMAP_F_COW		0x08	/* cow before write */
> 
> "Copy on write before write"? :)

Yes, I could use a better comment. Thanks for pointing.
Dave Chinner April 1, 2019, 11:06 p.m. UTC | #6
On Mon, Apr 01, 2019 at 04:41:02PM -0500, Goldwyn Rodrigues wrote:
> On 15:38 01/04, Dave Chinner wrote:
> > On Tue, Mar 26, 2019 at 02:02:50PM -0500, Goldwyn Rodrigues wrote:
> > > From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> > > 
> > > The IOMAP_F_COW is a flag to notify dax that it needs to copy
> > > the data from iomap->cow_addr to iomap->addr, if the start/end
> > > of I/O are not page aligned.
> > 
> > I see what you are trying to do here, but this is kinda gross.
> > 
> > > This also introduces dax_to_dax_copy() which performs a copy
> > > from one part of the device to another, to a maximum of one page.
> > > 
> > > Question: Using iomap.cow_addr == 0 means the CoW is to be copied
> > > (or memset) from a hole. Would this be better handled through a flag?
> > 
> > That's what all these checks in the iomap code do:
> > 
> 
> This is using iomap->flags not type.

Yes, I know. The fact that you tell me this (when it was obvious)
indicates to me that you didn't understand what I was saying.

i.e. the gross hack here is that this patch is trying to define a
new iomap type - both behaviourally and iomap content - via adding
a modifier flag rather than defining a new iomap->type. That's the
gross hack, and everything stems from that.

i.e. the "bloating" of the struct iomap is caused because the flag
modifier (IOMAP_F_COW) can't use parts of the iomap that are defined
for specific iomap types. e.g. IOMAP_INLINE type uses ->inline_data,
and so it can't be re-used by a iomap flag modifier such as
IOMAP_F_COW.

However, if we define a new type for this "need multiple mappings"
iomap rather than a flag, we don't need any new fields in the struct
iomap because we can use what already exists in the iomap.

> > 	if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
> > 
> > Oh, wait, you're trying to map two ranges in a single iomap and then
> > infer state from information that got chucked away.... IOWs, you're
> > doing it wrong - iomap algorithms are driven by how we manipulate
> > iomaps to do data operations efficiently, not how we copy data page
> > by page.
> > 
> > IOWs, what we really should have here is two iomaps - a source
> > and a destination iomap. The source is a read mapping of the
> > current address (where we are going to copy the data from), the
> > destination is the post-cow allocation mapping (where the data
> > goes).
> > 
> > Now you just copy the data from one map to the other iterating
> > source mappings until the necessary range of the destination has
> > been covered.  And you can check if the source map is IOMAP_HOLE or
> > IOMAP_UNWRITTEN and hence optimise the copy (i.e. zero the new
> > allocation) before copying in the new data.
> 
> Won't that be inefficient? With CoW we only need to write the first
> and last block.

You're assuming that partial data overwrites are the only case where
this dax-to-dax copy of a file range is required. That assumption is
false.

i.e. FALLOC_FL_UNSHARE_RANGE for DAX requires iterating over the
entire source range and copying all the contents to the newly
allocated destination range.  The partial block copying is just a
short version of this limited to a single block.

Sure, btrfs doesn't support FALLOC_FL_UNSHARE_RANGE, but if you're
going to be adding support for reflink to DAX, the infrastructure
needs to provide support for performing FALLOC_FL_UNSHARE_RANGE
to break extent sharing efficiently.

> Again, that is not required if the offset/end
> offset is block aligned. After that, it falls back to the
> regular write path of performing dax_copy_to_iter().
> We don't deal with IOMAP_UNWRITTEN in dax,

Yes we do. fallocate() can lay down unwritten extents, and we can
both read and write to them. See, for example, dax_iomap_actor()
called from dax_iomap_rw() via iomap_apply() - it does not call
dax_copy_to_iter() for reads if the range is IOMAP_HOLE or
IOMAP_UNWRITTEN.

> though other
> filesystems in the future may use CoW without dax.

That makes no sense. :/

> Besides, what you are suggesting will not fit well with the
> current iomap iterator code and would require another function
> altogether.

I'm not concerned about that - I would much prefer we do things
cleanly and properly rather than make expedient hacks for
questionable benefit that we'll have to completely rework or remove
the moment we implement DAX+reflink support in XFS.

> After Darrick's suggestion, we can even do away with cow_pos, so
> only the read address of cow_addr will exist.

As I mentioned earlier, even that is not necessary.

This is DAX - the iomap API and mapping functions can already return
pointers to inline data, and DAX can effectively be considered
inline data for the purposes of reading data.

As I said, the problem here is you are trying to use flags to define
a new type of iomap operation requires two mappings rather than one.
IMO, we should be defining an IOMAP_DAX_COW /type/ and then define
it to contain and behave as follows:

	- new destination region for data to be copied into is the
	  same setup as IOMAP_MAPPED
	- existing shared data that may be needed for reading is
	  mapped direct to device address by ->iomap_begin into
	  iomap->inline_data
	- if the iomap infrastructure needs to copy original source
	  data into destination, it copies directly from the memory
	  address in iomap->inline_data into the directly mapped DAX
	  desitination via memcpy().

This covers both the partial write COW case you are concerned with
here, and the full-extent range copy case that
FALLOC_FL_UNSHARE_RANGE requires.

Cheers,

Dave.
Goldwyn Rodrigues April 3, 2019, 1:56 a.m. UTC | #7
On 10:06 02/04, Dave Chinner wrote:
> On Mon, Apr 01, 2019 at 04:41:02PM -0500, Goldwyn Rodrigues wrote:
> > On 15:38 01/04, Dave Chinner wrote:
> > > On Tue, Mar 26, 2019 at 02:02:50PM -0500, Goldwyn Rodrigues wrote:
> > > > From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> > > > 
> > > > The IOMAP_F_COW is a flag to notify dax that it needs to copy
> > > > the data from iomap->cow_addr to iomap->addr, if the start/end
> > > > of I/O are not page aligned.
> > > 
> > > I see what you are trying to do here, but this is kinda gross.
> > > 
> > > > This also introduces dax_to_dax_copy() which performs a copy
> > > > from one part of the device to another, to a maximum of one page.
> > > > 
> > > > Question: Using iomap.cow_addr == 0 means the CoW is to be copied
> > > > (or memset) from a hole. Would this be better handled through a flag?
> > > 
> > > That's what all these checks in the iomap code do:
> > > 
> > 
> > This is using iomap->flags not type.
> 
> Yes, I know. The fact that you tell me this (when it was obvious)
> indicates to me that you didn't understand what I was saying.
> 
> i.e. the gross hack here is that this patch is trying to define a
> new iomap type - both behaviourally and iomap content - via adding
> a modifier flag rather than defining a new iomap->type. That's the
> gross hack, and everything stems from that.
> 
> i.e. the "bloating" of the struct iomap is caused because the flag
> modifier (IOMAP_F_COW) can't use parts of the iomap that are defined
> for specific iomap types. e.g. IOMAP_INLINE type uses ->inline_data,
> and so it can't be re-used by a iomap flag modifier such as
> IOMAP_F_COW.
> 
> However, if we define a new type for this "need multiple mappings"
> iomap rather than a flag, we don't need any new fields in the struct
> iomap because we can use what already exists in the iomap.
> 
> > > 	if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
> > > 
> > > Oh, wait, you're trying to map two ranges in a single iomap and then
> > > infer state from information that got chucked away.... IOWs, you're
> > > doing it wrong - iomap algorithms are driven by how we manipulate
> > > iomaps to do data operations efficiently, not how we copy data page
> > > by page.
> > > 
> > > IOWs, what we really should have here is two iomaps - a source
> > > and a destination iomap. The source is a read mapping of the
> > > current address (where we are going to copy the data from), the
> > > destination is the post-cow allocation mapping (where the data
> > > goes).
> > > 
> > > Now you just copy the data from one map to the other iterating
> > > source mappings until the necessary range of the destination has
> > > been covered.  And you can check if the source map is IOMAP_HOLE or
> > > IOMAP_UNWRITTEN and hence optimise the copy (i.e. zero the new
> > > allocation) before copying in the new data.
> > 
> > Won't that be inefficient? With CoW we only need to write the first
> > and last block.
> 
> You're assuming that partial data overwrites are the only case where
> this dax-to-dax copy of a file range is required. That assumption is
> false.
> 
> i.e. FALLOC_FL_UNSHARE_RANGE for DAX requires iterating over the
> entire source range and copying all the contents to the newly
> allocated destination range.  The partial block copying is just a
> short version of this limited to a single block.
> 
> Sure, btrfs doesn't support FALLOC_FL_UNSHARE_RANGE, but if you're
> going to be adding support for reflink to DAX, the infrastructure
> needs to provide support for performing FALLOC_FL_UNSHARE_RANGE
> to break extent sharing efficiently.
> 
> > Again, that is not required if the offset/end
> > offset is block aligned. After that, it falls back to the
> > regular write path of performing dax_copy_to_iter().
> > We don't deal with IOMAP_UNWRITTEN in dax,
> 
> Yes we do. fallocate() can lay down unwritten extents, and we can
> both read and write to them. See, for example, dax_iomap_actor()
> called from dax_iomap_rw() via iomap_apply() - it does not call
> dax_copy_to_iter() for reads if the range is IOMAP_HOLE or
> IOMAP_UNWRITTEN.
> 
> > though other
> > filesystems in the future may use CoW without dax.
> 
> That makes no sense. :/
> 
> > Besides, what you are suggesting will not fit well with the
> > current iomap iterator code and would require another function
> > altogether.
> 
> I'm not concerned about that - I would much prefer we do things
> cleanly and properly rather than make expedient hacks for
> questionable benefit that we'll have to completely rework or remove
> the moment we implement DAX+reflink support in XFS.
> 
> > After Darrick's suggestion, we can even do away with cow_pos, so
> > only the read address of cow_addr will exist.
> 
> As I mentioned earlier, even that is not necessary.
> 
> This is DAX - the iomap API and mapping functions can already return
> pointers to inline data, and DAX can effectively be considered
> inline data for the purposes of reading data.
> 
> As I said, the problem here is you are trying to use flags to define
> a new type of iomap operation requires two mappings rather than one.
> IMO, we should be defining an IOMAP_DAX_COW /type/ and then define
> it to contain and behave as follows:
> 
> 	- new destination region for data to be copied into is the
> 	  same setup as IOMAP_MAPPED
> 	- existing shared data that may be needed for reading is
> 	  mapped direct to device address by ->iomap_begin into
> 	  iomap->inline_data
> 	- if the iomap infrastructure needs to copy original source
> 	  data into destination, it copies directly from the memory
> 	  address in iomap->inline_data into the directly mapped DAX
> 	  desitination via memcpy().
> 
> This covers both the partial write COW case you are concerned with
> here, and the full-extent range copy case that
> FALLOC_FL_UNSHARE_RANGE requires.
> 

Ok, understood. However, we may have to differentiate between a CoW
with FALLOC_FL_UNSHARE_RANGE because CoW will only (conditionally)
copy the first and the last block wheras FALLOC_FL_UNSHARE_RANGE
will copy the whole range. So, should it be another type?

Also, this would require iomap_begin to translate block->dax address
which I suppose can be a function in the dax code.
Dave Chinner April 3, 2019, 3:20 a.m. UTC | #8
On Tue, Apr 02, 2019 at 08:56:31PM -0500, Goldwyn Rodrigues wrote:
> On 10:06 02/04, Dave Chinner wrote:
> > On Mon, Apr 01, 2019 at 04:41:02PM -0500, Goldwyn Rodrigues wrote:
> > > After Darrick's suggestion, we can even do away with cow_pos, so
> > > only the read address of cow_addr will exist.
> > 
> > As I mentioned earlier, even that is not necessary.
> > 
> > This is DAX - the iomap API and mapping functions can already return
> > pointers to inline data, and DAX can effectively be considered
> > inline data for the purposes of reading data.
> > 
> > As I said, the problem here is you are trying to use flags to define
> > a new type of iomap operation requires two mappings rather than one.
> > IMO, we should be defining an IOMAP_DAX_COW /type/ and then define
> > it to contain and behave as follows:
> > 
> > 	- new destination region for data to be copied into is the
> > 	  same setup as IOMAP_MAPPED
> > 	- existing shared data that may be needed for reading is
> > 	  mapped direct to device address by ->iomap_begin into
> > 	  iomap->inline_data
> > 	- if the iomap infrastructure needs to copy original source
> > 	  data into destination, it copies directly from the memory
> > 	  address in iomap->inline_data into the directly mapped DAX
> > 	  desitination via memcpy().
> > 
> > This covers both the partial write COW case you are concerned with
> > here, and the full-extent range copy case that
> > FALLOC_FL_UNSHARE_RANGE requires.
> > 
> 
> Ok, understood. However, we may have to differentiate between a CoW
> with FALLOC_FL_UNSHARE_RANGE because CoW will only (conditionally)
> copy the first and the last block wheras FALLOC_FL_UNSHARE_RANGE
> will copy the whole range. So, should it be another type?

I don't see why that would be necessary. The calling contexts are
completely different, but both know exactly what they have to copy
from the iomap's inline data. They fact they copy different parts of
data from the iomap doesn't mean we need different types of iomaps
for them...

> Also, this would require iomap_begin to translate block->dax address
> which I suppose can be a function in the dax code.

bdev_dax_pgoff() + dax_direct_access() should already provide what
we need here.

Cheers,

Dave.
Christoph Hellwig April 7, 2019, 7:26 a.m. UTC | #9
On Mon, Apr 01, 2019 at 03:38:52PM +1100, Dave Chinner wrote:
> On Tue, Mar 26, 2019 at 02:02:50PM -0500, Goldwyn Rodrigues wrote:
> > From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> > 
> > The IOMAP_F_COW is a flag to notify dax that it needs to copy
> > the data from iomap->cow_addr to iomap->addr, if the start/end
> > of I/O are not page aligned.
> 
> I see what you are trying to do here, but this is kinda gross.

Even more gross than our current hack where for buffered I/O
we return the previous address but still allocate delalloc blocks
for the COW? ;-)

This whole area is a mine field..

> Oh, wait, you're trying to map two ranges in a single iomap and then
> infer state from information that got chucked away.... IOWs, you're
> doing it wrong - iomap algorithms are driven by how we manipulate
> iomaps to do data operations efficiently, not how we copy data page
> by page.
> 
> IOWs, what we really should have here is two iomaps - a source
> and a destination iomap. The source is a read mapping of the
> current address (where we are going to copy the data from), the
> destination is the post-cow allocation mapping (where the data
> goes).
> 
> Now you just copy the data from one map to the other iterating
> source mappings until the necessary range of the destination has
> been covered.  And you can check if the source map is IOMAP_HOLE or
> IOMAP_UNWRITTEN and hence optimise the copy (i.e. zero the new
> allocation) before copying in the new data.
> 
> Even better, only do the source mapping if the write isn't
> page/filesystem block aligned, and hence only do the slow path
> copying if a source mapping exists....

I looked into the do two maps instead of the mentioned above existing
gross hack when doing the buffer_head removal for iomap.  The major
issue with it is that we need a way for the second recursive mapping
to not take the inode-wide locks.  I still think it is worth it
(or move to per-fork or range locks) as it defintively is the much
cleaner way to handle it.  I just didn't bother to go there yet
as I wanted the buffer_head removal and thus kept the existing gross
hack for now.  But moving to a two mapping scheme for read-modify-write
ops would be a major improvement for sure.

Patch
diff mbox series

diff --git a/fs/dax.c b/fs/dax.c
index ca0671d55aa6..e254535dd830 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1051,6 +1051,28 @@  static bool dax_range_is_aligned(struct block_device *bdev,
 	return true;
 }
 
+static void dax_to_dax_copy(struct iomap *iomap, loff_t pos, void *daddr,
+			    size_t len)
+{
+	loff_t blk_start, blk_pg;
+	void *saddr;
+	ssize_t map_len;
+
+	/* A zero address is a hole. */
+	if (iomap->cow_addr == 0) {
+		memset(daddr, 0, len);
+		return;
+	}
+
+	blk_start = iomap->cow_addr + pos - iomap->cow_pos;
+	blk_pg = round_down(blk_start, PAGE_SIZE);
+
+	map_len = dax_direct_access(iomap->dax_dev, PHYS_PFN(blk_pg), PAGE_SIZE,
+			&saddr, NULL);
+	saddr += blk_start - blk_pg;
+	memcpy(daddr, saddr, len);
+}
+
 int __dax_zero_page_range(struct block_device *bdev,
 		struct dax_device *dax_dev, sector_t sector,
 		unsigned int offset, unsigned int size)
@@ -1143,6 +1165,20 @@  dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 			break;
 		}
 
+		if (iomap->flags & IOMAP_F_COW) {
+			loff_t pg_end = round_up(end, PAGE_SIZE);
+			/*
+			 * Copy the first part of the page
+			 * Note: we pass offset as length
+			 */
+			if (offset)
+				dax_to_dax_copy(iomap, pos - offset, kaddr, offset);
+
+			/* Copy the last part of the range */
+			if (end < pg_end)
+				dax_to_dax_copy(iomap, end, kaddr + offset + length, pg_end - end);
+		}
+
 		map_len = PFN_PHYS(map_len);
 		kaddr += offset;
 		map_len -= offset;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 0fefb5455bda..391785de1428 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -35,6 +35,7 @@  struct vm_fault;
 #define IOMAP_F_NEW		0x01	/* blocks have been newly allocated */
 #define IOMAP_F_DIRTY		0x02	/* uncommitted metadata */
 #define IOMAP_F_BUFFER_HEAD	0x04	/* file system requires buffer heads */
+#define IOMAP_F_COW		0x08	/* cow before write */
 
 /*
  * Flags that only need to be reported for IOMAP_REPORT requests:
@@ -59,6 +60,8 @@  struct iomap {
 	u64			length;	/* length of mapping, bytes */
 	u16			type;	/* type of mapping */
 	u16			flags;	/* flags for mapping */
+	u64			cow_addr; /* read address to perform CoW */
+	loff_t			cow_pos; /* file offset of cow_addr */
 	struct block_device	*bdev;	/* block device for I/O */
 	struct dax_device	*dax_dev; /* dax_dev for dax operations */
 	void			*inline_data;