diff mbox series

[04/18] dax: Introduce IOMAP_DAX_COW to CoW edges during writes

Message ID 20190429172649.8288-5-rgoldwyn@suse.de (mailing list archive)
State New, archived
Headers show
Series btrfs dax support | expand

Commit Message

Goldwyn Rodrigues April 29, 2019, 5:26 p.m. UTC
From: Goldwyn Rodrigues <rgoldwyn@suse.com>

IOMAP_DAX_COW is an iomap type which performs a copy of the
edges of data while performing a write if start/end are
not page aligned. The source address is expected in
iomap->inline_data.

dax_copy_edges() is a helper function that performs a copy from
one part of the device to another for data that is not page aligned.
If iomap->inline_data is NULL, it memset's the area to zero.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 fs/dax.c              | 46 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/iomap.h |  1 +
 2 files changed, 46 insertions(+), 1 deletion(-)

Comments

Darrick J. Wong May 21, 2019, 4:51 p.m. UTC | #1
On Mon, Apr 29, 2019 at 12:26:35PM -0500, Goldwyn Rodrigues wrote:
> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> 
> The IOMAP_DAX_COW is a iomap type which performs copy of
> edges of data while performing a write if start/end are
> not page aligned. The source address is expected in
> iomap->inline_data.
> 
> dax_copy_edges() is a helper functions performs a copy from
> one part of the device to another for data not page aligned.
> If iomap->inline_data is NULL, it memset's the area to zero.
> 
> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> ---
>  fs/dax.c              | 46 +++++++++++++++++++++++++++++++++++++++++++++-
>  include/linux/iomap.h |  1 +
>  2 files changed, 46 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index e5e54da1715f..610bfa861a28 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1084,6 +1084,42 @@ int __dax_zero_page_range(struct block_device *bdev,
>  }
>  EXPORT_SYMBOL_GPL(__dax_zero_page_range);
>  
> +/*
> + * dax_copy_edges - Copies the part of the pages not included in
> + * 		    the write, but required for CoW because
> + * 		    offset/offset+length are not page aligned.
> + */
> +static int dax_copy_edges(struct inode *inode, loff_t pos, loff_t length,
> +			   struct iomap *iomap, void *daddr)
> +{
> +	unsigned offset = pos & (PAGE_SIZE - 1);
> +	loff_t end = pos + length;
> +	loff_t pg_end = round_up(end, PAGE_SIZE);
> +	void *saddr = iomap->inline_data;
> +	int ret = 0;
> +	/*
> +	 * Copy the first part of the page
> +	 * Note: we pass offset as length
> +	 */
> +	if (offset) {
> +		if (saddr)
> +			ret = memcpy_mcsafe(daddr, saddr, offset);
> +		else
> +			memset(daddr, 0, offset);
> +	}
> +
> +	/* Copy the last part of the range */
> +	if (end < pg_end) {
> +		if (saddr)
> +			ret = memcpy_mcsafe(daddr + offset + length,
> +			       saddr + offset + length,	pg_end - end);
> +		else
> +			memset(daddr + offset + length, 0,
> +					pg_end - end);
> +	}
> +	return ret;
> +}
> +
>  static loff_t
>  dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>  		struct iomap *iomap)
> @@ -1105,9 +1141,11 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>  			return iov_iter_zero(min(length, end - pos), iter);
>  	}
>  
> -	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
> +	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED
> +			 && iomap->type != IOMAP_DAX_COW))

I reiterate (from V3) that the && goes on the previous line...

	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
			 iomap->type != IOMAP_DAX_COW))

>  		return -EIO;
>  
> +
>  	/*
>  	 * Write can allocate block for an area which has a hole page mapped
>  	 * into page tables. We have to tear down these mappings so that data
> @@ -1144,6 +1182,12 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>  			break;
>  		}
>  
> +		if (iomap->type == IOMAP_DAX_COW) {
> +			ret = dax_copy_edges(inode, pos, length, iomap, kaddr);
> +			if (ret)
> +				break;
> +		}
> +
>  		map_len = PFN_PHYS(map_len);
>  		kaddr += offset;
>  		map_len -= offset;
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index 0fefb5455bda..6e885c5a38a3 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -25,6 +25,7 @@ struct vm_fault;
>  #define IOMAP_MAPPED	0x03	/* blocks allocated at @addr */
>  #define IOMAP_UNWRITTEN	0x04	/* blocks allocated at @addr in unwritten state */
>  #define IOMAP_INLINE	0x05	/* data inline in the inode */

> +#define IOMAP_DAX_COW	0x06

DAX isn't going to be the only scenario where we need a way to
communicate to iomap actors the need to implement copy on write.

XFS also uses struct iomap to hand out file leases to clients.  The
lease code /currently/ doesn't support files with shared blocks (because
the only user is pNFS) but one could easily imagine a future where some
client wants to lease a file with shared blocks, in which case XFS will
want to convey the COW details to the lessee.

> +/* Copy data pointed by inline_data before write*/

A month ago during the V3 patchset review, I wrote (possibly in another
thread, sorry) about something that I'm putting my foot down about now
for the V4 patchset, which is the {re,ab}use of @inline_data for the
data source address.

We cannot use @inline_data to convey the source address.  @inline_data
(so far) is used to point to the in-memory representation of the storage
described by @addr.  For data writes, @addr is the location of the write
on disk and @inline_data is the location of the write in memory.

Reusing @inline_data here to point to the location of the source data in
memory is a totally different thing and will likely result in confusion.
On a practical level, this also means that we cannot support the case of
COW && INLINE because the type codes collide and so would the users of
@inline_data.  This isn't required *right now*, but if you had a pmem
filesystem that stages inode updates in memory and flips a pointer to
commit changes then the ->iomap_begin function will need to convey two
pointers at once.

So this brings us back to Dave's suggestion during the V1 patchset
review that instead of adding more iomap flags/types and overloading
fields, we simply pass two struct iomaps into ->iomap_begin:

 - Change iomap_apply() to "struct iomap iomap[2] = 0;" and pass
   &iomap[0] into the ->iomap_begin and ->iomap_end functions.  The
   first iomap will be filled in with the destination for the write (as
   all implementations do now), and the second iomap can be filled in
   with the source information for a COW operation.

 - If the ->iomap_begin implementation decides that COW is necessary for
   the requested operation, then it should fill out that second iomap
   with information about the extent that the actor must copy before
   returning.  The second iomap's offset and length must match the
   first.  If COW isn't necessary, the ->iomap_begin implementation
   ignores it, and the second iomap retains type == 0 (i.e. invalid
   mapping).

Proceeding along these lines will (AFAICT) still allow you to enable all
the btrfs functionality in the rest of this patchset while making the
task of wiring up XFS fairly simple.  No overloaded fields and no new
flags.

This is how I'd like to see this patchset proceed to V5.  Does
that make sense?

--D

>  
>  /*
>   * Flags for all iomap mappings:
> -- 
> 2.16.4
>
Goldwyn Rodrigues May 22, 2019, 8:14 p.m. UTC | #2
On  9:51 21/05, Darrick J. Wong wrote:
> On Mon, Apr 29, 2019 at 12:26:35PM -0500, Goldwyn Rodrigues wrote:
> > From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> > 
> > The IOMAP_DAX_COW is a iomap type which performs copy of
> > edges of data while performing a write if start/end are
> > not page aligned. The source address is expected in
> > iomap->inline_data.
> > 
> > dax_copy_edges() is a helper functions performs a copy from
> > one part of the device to another for data not page aligned.
> > If iomap->inline_data is NULL, it memset's the area to zero.
> > 
> > Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> > ---
> >  fs/dax.c              | 46 +++++++++++++++++++++++++++++++++++++++++++++-
> >  include/linux/iomap.h |  1 +
> >  2 files changed, 46 insertions(+), 1 deletion(-)
> > 
> > diff --git a/fs/dax.c b/fs/dax.c
> > index e5e54da1715f..610bfa861a28 100644
> > --- a/fs/dax.c
> > +++ b/fs/dax.c
> > @@ -1084,6 +1084,42 @@ int __dax_zero_page_range(struct block_device *bdev,
> >  }
> >  EXPORT_SYMBOL_GPL(__dax_zero_page_range);
> >  
> > +/*
> > + * dax_copy_edges - Copies the part of the pages not included in
> > + * 		    the write, but required for CoW because
> > + * 		    offset/offset+length are not page aligned.
> > + */
> > +static int dax_copy_edges(struct inode *inode, loff_t pos, loff_t length,
> > +			   struct iomap *iomap, void *daddr)
> > +{
> > +	unsigned offset = pos & (PAGE_SIZE - 1);
> > +	loff_t end = pos + length;
> > +	loff_t pg_end = round_up(end, PAGE_SIZE);
> > +	void *saddr = iomap->inline_data;
> > +	int ret = 0;
> > +	/*
> > +	 * Copy the first part of the page
> > +	 * Note: we pass offset as length
> > +	 */
> > +	if (offset) {
> > +		if (saddr)
> > +			ret = memcpy_mcsafe(daddr, saddr, offset);
> > +		else
> > +			memset(daddr, 0, offset);
> > +	}
> > +
> > +	/* Copy the last part of the range */
> > +	if (end < pg_end) {
> > +		if (saddr)
> > +			ret = memcpy_mcsafe(daddr + offset + length,
> > +			       saddr + offset + length,	pg_end - end);
> > +		else
> > +			memset(daddr + offset + length, 0,
> > +					pg_end - end);
> > +	}
> > +	return ret;
> > +}
> > +
> >  static loff_t
> >  dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
> >  		struct iomap *iomap)
> > @@ -1105,9 +1141,11 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
> >  			return iov_iter_zero(min(length, end - pos), iter);
> >  	}
> >  
> > -	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
> > +	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED
> > +			 && iomap->type != IOMAP_DAX_COW))
> 
> I reiterate (from V3) that the && goes on the previous line...
> 
> 	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
> 			 iomap->type != IOMAP_DAX_COW))
> 
> >  		return -EIO;
> >  
> > +
> >  	/*
> >  	 * Write can allocate block for an area which has a hole page mapped
> >  	 * into page tables. We have to tear down these mappings so that data
> > @@ -1144,6 +1182,12 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
> >  			break;
> >  		}
> >  
> > +		if (iomap->type == IOMAP_DAX_COW) {
> > +			ret = dax_copy_edges(inode, pos, length, iomap, kaddr);
> > +			if (ret)
> > +				break;
> > +		}
> > +
> >  		map_len = PFN_PHYS(map_len);
> >  		kaddr += offset;
> >  		map_len -= offset;
> > diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> > index 0fefb5455bda..6e885c5a38a3 100644
> > --- a/include/linux/iomap.h
> > +++ b/include/linux/iomap.h
> > @@ -25,6 +25,7 @@ struct vm_fault;
> >  #define IOMAP_MAPPED	0x03	/* blocks allocated at @addr */
> >  #define IOMAP_UNWRITTEN	0x04	/* blocks allocated at @addr in unwritten state */
> >  #define IOMAP_INLINE	0x05	/* data inline in the inode */
> 
> > +#define IOMAP_DAX_COW	0x06
> 
> DAX isn't going to be the only scenario where we need a way to
> communicate to iomap actors the need to implement copy on write.
> 
> XFS also uses struct iomap to hand out file leases to clients.  The
> lease code /currently/ doesn't support files with shared blocks (because
> the only user is pNFS) but one could easily imagine a future where some
> client wants to lease a file with shared blocks, in which case XFS will
> want to convey the COW details to the lessee.
> 
> > +/* Copy data pointed by inline_data before write*/
> 
> A month ago during the V3 patchset review, I wrote (possibly in an other
> thread, sorry) about something that I'm putting my foot down about now
> for the V4 patchset, which is the {re,ab}use of @inline_data for the
> data source address.

Looks like I missed this.
> 
> We cannot use @inline_data to convey the source address.  @inline_data
> (so far) is used to point to the in-memory representation of the storage
> described by @addr.  For data writes, @addr is the location of the write
> on disk and @inline_data is the location of the write in memory.
> 
> Reusing @inline_data here to point to the location of the source data in
> memory is a totally different thing and will likely result in confusion.
> On a practical level, this also means that we cannot support the case of
> COW && INLINE because the type codes collide and so would the users of
> @inline_data.  This isn't required *right now*, but if you had a pmem
> filesystem that stages inode updates in memory and flips a pointer to
> commit changes then the ->iomap_begin function will need to convey two
> pointers at once.
> 
> So this brings us back to Dave's suggestion during the V1 patchset
> review that instead of adding more iomap flags/types and overloading
> fields, we simply pass two struct iomaps into ->iomap_begin:

Actually, Dave is the one who suggested to perform it this way.
https://patchwork.kernel.org/comment/22562195/

> 
>  - Change iomap_apply() to "struct iomap iomap[2] = 0;" and pass
>    &iomap[0] into the ->iomap_begin and ->iomap_end functions.  The
>    first iomap will be filled in with the destination for the write (as
>    all implementations do now), and the second iomap can be filled in
>    with the source information for a COW operation.
> 
>  - If the ->iomap_begin implementation decides that COW is necessary for
>    the requested operation, then it should fill out that second iomap
>    with information about the extent that the actor must copied before
>    returning.  The second iomap's offset and length must match the
>    first.  If COW isn't necessary, the ->iomap_begin implementation
>    ignores it, and the second iomap retains type == 0 (i.e. invalid
>    mapping).
> 
> Proceeding along these lines will (AFAICT) still allow you to enable all
> the btrfs functionality in the rest of this patchset while making the
> task of wiring up XFS fairly simple.  No overloaded fields and no new
> flags.
> 
> This is how I'd like to see this patchset should proceed to V5.  Does
> that make sense?


Yes, I think this would be a more flexible design as well if we ever
decide to extend it beyond dax.
We would still need an IOMAP_COW type set in iomap[0].
Dave Chinner May 23, 2019, 2:10 a.m. UTC | #3
On Wed, May 22, 2019 at 03:14:47PM -0500, Goldwyn Rodrigues wrote:
> On  9:51 21/05, Darrick J. Wong wrote:
> > On Mon, Apr 29, 2019 at 12:26:35PM -0500, Goldwyn Rodrigues wrote:
> > We cannot use @inline_data to convey the source address.  @inline_data
> > (so far) is used to point to the in-memory representation of the storage
> > described by @addr.  For data writes, @addr is the location of the write
> > on disk and @inline_data is the location of the write in memory.
> > 
> > Reusing @inline_data here to point to the location of the source data in
> > memory is a totally different thing and will likely result in confusion.
> > On a practical level, this also means that we cannot support the case of
> > COW && INLINE because the type codes collide and so would the users of
> > @inline_data.  This isn't required *right now*, but if you had a pmem
> > filesystem that stages inode updates in memory and flips a pointer to
> > commit changes then the ->iomap_begin function will need to convey two
> > pointers at once.
> > 
> > So this brings us back to Dave's suggestion during the V1 patchset
> > review that instead of adding more iomap flags/types and overloading
> > fields, we simply pass two struct iomaps into ->iomap_begin:
> 
> Actually, Dave is the one who suggested to perform it this way.
> https://patchwork.kernel.org/comment/22562195/

My first suggestion was to use two iomaps. This suggestion came
later, as a way of demonstrating that a different type could be used
to redefine what ->inline_data was used for, if people considered
that an acceptable solution.

What was apparent from other discussions in the thread you quote was
that using two iomaps looked to be the better, more general approach
to solving the iomap read-modify-write issue at hand.

Cheers,

Dave.
Ruan Shiyang May 23, 2019, 9:05 a.m. UTC | #4
On 5/22/19 12:51 AM, Darrick J. Wong wrote:
> On Mon, Apr 29, 2019 at 12:26:35PM -0500, Goldwyn Rodrigues wrote:
>> From: Goldwyn Rodrigues <rgoldwyn@suse.com>
>>
>> The IOMAP_DAX_COW is a iomap type which performs copy of
>> edges of data while performing a write if start/end are
>> not page aligned. The source address is expected in
>> iomap->inline_data.
>>
>> dax_copy_edges() is a helper functions performs a copy from
>> one part of the device to another for data not page aligned.
>> If iomap->inline_data is NULL, it memset's the area to zero.
>>
>> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
>> ---
>>   fs/dax.c              | 46 +++++++++++++++++++++++++++++++++++++++++++++-
>>   include/linux/iomap.h |  1 +
>>   2 files changed, 46 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/dax.c b/fs/dax.c
>> index e5e54da1715f..610bfa861a28 100644
>> --- a/fs/dax.c
>> +++ b/fs/dax.c
>> @@ -1084,6 +1084,42 @@ int __dax_zero_page_range(struct block_device *bdev,
>>   }
>>   EXPORT_SYMBOL_GPL(__dax_zero_page_range);
>>   
>> +/*
>> + * dax_copy_edges - Copies the part of the pages not included in
>> + * 		    the write, but required for CoW because
>> + * 		    offset/offset+length are not page aligned.
>> + */
>> +static int dax_copy_edges(struct inode *inode, loff_t pos, loff_t length,
>> +			   struct iomap *iomap, void *daddr)
>> +{
>> +	unsigned offset = pos & (PAGE_SIZE - 1);
>> +	loff_t end = pos + length;
>> +	loff_t pg_end = round_up(end, PAGE_SIZE);
>> +	void *saddr = iomap->inline_data;
>> +	int ret = 0;
>> +	/*
>> +	 * Copy the first part of the page
>> +	 * Note: we pass offset as length
>> +	 */
>> +	if (offset) {
>> +		if (saddr)
>> +			ret = memcpy_mcsafe(daddr, saddr, offset);
>> +		else
>> +			memset(daddr, 0, offset);
>> +	}
>> +
>> +	/* Copy the last part of the range */
>> +	if (end < pg_end) {
>> +		if (saddr)
>> +			ret = memcpy_mcsafe(daddr + offset + length,
>> +			       saddr + offset + length,	pg_end - end);
>> +		else
>> +			memset(daddr + offset + length, 0,
>> +					pg_end - end);
>> +	}
>> +	return ret;
>> +}
>> +
>>   static loff_t
>>   dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>>   		struct iomap *iomap)
>> @@ -1105,9 +1141,11 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>>   			return iov_iter_zero(min(length, end - pos), iter);
>>   	}
>>   
>> -	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
>> +	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED
>> +			 && iomap->type != IOMAP_DAX_COW))
> 
> I reiterate (from V3) that the && goes on the previous line...
> 
> 	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
> 			 iomap->type != IOMAP_DAX_COW))
> 
>>   		return -EIO;
>>   
>> +
>>   	/*
>>   	 * Write can allocate block for an area which has a hole page mapped
>>   	 * into page tables. We have to tear down these mappings so that data
>> @@ -1144,6 +1182,12 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
>>   			break;
>>   		}
>>   
>> +		if (iomap->type == IOMAP_DAX_COW) {
>> +			ret = dax_copy_edges(inode, pos, length, iomap, kaddr);
>> +			if (ret)
>> +				break;
>> +		}
>> +
>>   		map_len = PFN_PHYS(map_len);
>>   		kaddr += offset;
>>   		map_len -= offset;
>> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
>> index 0fefb5455bda..6e885c5a38a3 100644
>> --- a/include/linux/iomap.h
>> +++ b/include/linux/iomap.h
>> @@ -25,6 +25,7 @@ struct vm_fault;
>>   #define IOMAP_MAPPED	0x03	/* blocks allocated at @addr */
>>   #define IOMAP_UNWRITTEN	0x04	/* blocks allocated at @addr in unwritten state */
>>   #define IOMAP_INLINE	0x05	/* data inline in the inode */
> 
>> +#define IOMAP_DAX_COW	0x06
> 
> DAX isn't going to be the only scenario where we need a way to
> communicate to iomap actors the need to implement copy on write.
> 
> XFS also uses struct iomap to hand out file leases to clients.  The
> lease code /currently/ doesn't support files with shared blocks (because
> the only user is pNFS) but one could easily imagine a future where some
> client wants to lease a file with shared blocks, in which case XFS will
> want to convey the COW details to the lessee.
> 
>> +/* Copy data pointed by inline_data before write*/
> 
> A month ago during the V3 patchset review, I wrote (possibly in an other
> thread, sorry) about something that I'm putting my foot down about now
> for the V4 patchset, which is the {re,ab}use of @inline_data for the
> data source address.
> 
> We cannot use @inline_data to convey the source address.  @inline_data
> (so far) is used to point to the in-memory representation of the storage
> described by @addr.  For data writes, @addr is the location of the write
> on disk and @inline_data is the location of the write in memory.
> 
> Reusing @inline_data here to point to the location of the source data in
> memory is a totally different thing and will likely result in confusion.
> On a practical level, this also means that we cannot support the case of
> COW && INLINE because the type codes collide and so would the users of
> @inline_data.  This isn't required *right now*, but if you had a pmem
> filesystem that stages inode updates in memory and flips a pointer to
> commit changes then the ->iomap_begin function will need to convey two
> pointers at once.
> 
> So this brings us back to Dave's suggestion during the V1 patchset
> review that instead of adding more iomap flags/types and overloading
> fields, we simply pass two struct iomaps into ->iomap_begin:
> 
>   - Change iomap_apply() to "struct iomap iomap[2] = 0;" and pass
>     &iomap[0] into the ->iomap_begin and ->iomap_end functions.  The
>     first iomap will be filled in with the destination for the write (as
>     all implementations do now), and the second iomap can be filled in
>     with the source information for a COW operation.
> 
>   - If the ->iomap_begin implementation decides that COW is necessary for
>     the requested operation, then it should fill out that second iomap
>     with information about the extent that the actor must copied before
>     returning.  The second iomap's offset and length must match the
>     first.  If COW isn't necessary, the ->iomap_begin implementation

Hi,

I'm working on reflink & dax in XFS, here are some thoughts on this:

As mentioned above: the second iomap's offset and length must match the 
first.  I thought so at the beginning, but later found that the only 
difference between these two iomaps is @addr.  So, what about adding a 
@saddr, which means the source address of COW extent, into the struct 
iomap.  The ->iomap_begin() fills @saddr if the extent is COW, and 0 if 
not.  Then handle this @saddr in each ->actor().  No more modifications 
in other functions.

My RFC patchset[1] is implemented in this way and works for me, though
it is far from perfect.

[1]: https://patchwork.kernel.org/cover/10904307/
Goldwyn Rodrigues May 23, 2019, 11:51 a.m. UTC | #5
On 17:05 23/05, Shiyang Ruan wrote:
> 
> 
> On 5/22/19 12:51 AM, Darrick J. Wong wrote:
> > On Mon, Apr 29, 2019 at 12:26:35PM -0500, Goldwyn Rodrigues wrote:
> > > From: Goldwyn Rodrigues <rgoldwyn@suse.com>
> > > 
> > > The IOMAP_DAX_COW is a iomap type which performs copy of
> > > edges of data while performing a write if start/end are
> > > not page aligned. The source address is expected in
> > > iomap->inline_data.
> > > 
> > > dax_copy_edges() is a helper functions performs a copy from
> > > one part of the device to another for data not page aligned.
> > > If iomap->inline_data is NULL, it memset's the area to zero.
> > > 
> > > Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
> > > ---
> > >   fs/dax.c              | 46 +++++++++++++++++++++++++++++++++++++++++++++-
> > >   include/linux/iomap.h |  1 +
> > >   2 files changed, 46 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/fs/dax.c b/fs/dax.c
> > > index e5e54da1715f..610bfa861a28 100644
> > > --- a/fs/dax.c
> > > +++ b/fs/dax.c
> > > @@ -1084,6 +1084,42 @@ int __dax_zero_page_range(struct block_device *bdev,
> > >   }
> > >   EXPORT_SYMBOL_GPL(__dax_zero_page_range);
> > > +/*
> > > + * dax_copy_edges - Copies the part of the pages not included in
> > > + * 		    the write, but required for CoW because
> > > + * 		    offset/offset+length are not page aligned.
> > > + */
> > > +static int dax_copy_edges(struct inode *inode, loff_t pos, loff_t length,
> > > +			   struct iomap *iomap, void *daddr)
> > > +{
> > > +	unsigned offset = pos & (PAGE_SIZE - 1);
> > > +	loff_t end = pos + length;
> > > +	loff_t pg_end = round_up(end, PAGE_SIZE);
> > > +	void *saddr = iomap->inline_data;
> > > +	int ret = 0;
> > > +	/*
> > > +	 * Copy the first part of the page
> > > +	 * Note: we pass offset as length
> > > +	 */
> > > +	if (offset) {
> > > +		if (saddr)
> > > +			ret = memcpy_mcsafe(daddr, saddr, offset);
> > > +		else
> > > +			memset(daddr, 0, offset);
> > > +	}
> > > +
> > > +	/* Copy the last part of the range */
> > > +	if (end < pg_end) {
> > > +		if (saddr)
> > > +			ret = memcpy_mcsafe(daddr + offset + length,
> > > +			       saddr + offset + length,	pg_end - end);
> > > +		else
> > > +			memset(daddr + offset + length, 0,
> > > +					pg_end - end);
> > > +	}
> > > +	return ret;
> > > +}
> > > +
> > >   static loff_t
> > >   dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
> > >   		struct iomap *iomap)
> > > @@ -1105,9 +1141,11 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
> > >   			return iov_iter_zero(min(length, end - pos), iter);
> > >   	}
> > > -	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
> > > +	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED
> > > +			 && iomap->type != IOMAP_DAX_COW))
> > 
> > I reiterate (from V3) that the && goes on the previous line...
> > 
> > 	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
> > 			 iomap->type != IOMAP_DAX_COW))
> > 
> > >   		return -EIO;
> > > +
> > >   	/*
> > >   	 * Write can allocate block for an area which has a hole page mapped
> > >   	 * into page tables. We have to tear down these mappings so that data
> > > @@ -1144,6 +1182,12 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
> > >   			break;
> > >   		}
> > > +		if (iomap->type == IOMAP_DAX_COW) {
> > > +			ret = dax_copy_edges(inode, pos, length, iomap, kaddr);
> > > +			if (ret)
> > > +				break;
> > > +		}
> > > +
> > >   		map_len = PFN_PHYS(map_len);
> > >   		kaddr += offset;
> > >   		map_len -= offset;
> > > diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> > > index 0fefb5455bda..6e885c5a38a3 100644
> > > --- a/include/linux/iomap.h
> > > +++ b/include/linux/iomap.h
> > > @@ -25,6 +25,7 @@ struct vm_fault;
> > >   #define IOMAP_MAPPED	0x03	/* blocks allocated at @addr */
> > >   #define IOMAP_UNWRITTEN	0x04	/* blocks allocated at @addr in unwritten state */
> > >   #define IOMAP_INLINE	0x05	/* data inline in the inode */
> > 
> > > +#define IOMAP_DAX_COW	0x06
> > 
> > DAX isn't going to be the only scenario where we need a way to
> > communicate to iomap actors the need to implement copy on write.
> > 
> > XFS also uses struct iomap to hand out file leases to clients.  The
> > lease code /currently/ doesn't support files with shared blocks (because
> > the only user is pNFS) but one could easily imagine a future where some
> > client wants to lease a file with shared blocks, in which case XFS will
> > want to convey the COW details to the lessee.
> > 
> > > +/* Copy data pointed by inline_data before write*/
> > 
> > A month ago during the V3 patchset review, I wrote (possibly in an other
> > thread, sorry) about something that I'm putting my foot down about now
> > for the V4 patchset, which is the {re,ab}use of @inline_data for the
> > data source address.
> > 
> > We cannot use @inline_data to convey the source address.  @inline_data
> > (so far) is used to point to the in-memory representation of the storage
> > described by @addr.  For data writes, @addr is the location of the write
> > on disk and @inline_data is the location of the write in memory.
> > 
> > Reusing @inline_data here to point to the location of the source data in
> > memory is a totally different thing and will likely result in confusion.
> > On a practical level, this also means that we cannot support the case of
> > COW && INLINE because the type codes collide and so would the users of
> > @inline_data.  This isn't required *right now*, but if you had a pmem
> > filesystem that stages inode updates in memory and flips a pointer to
> > commit changes then the ->iomap_begin function will need to convey two
> > pointers at once.
> > 
> > So this brings us back to Dave's suggestion during the V1 patchset
> > review that instead of adding more iomap flags/types and overloading
> > fields, we simply pass two struct iomaps into ->iomap_begin:
> > 
> >   - Change iomap_apply() to "struct iomap iomap[2] = 0;" and pass
> >     &iomap[0] into the ->iomap_begin and ->iomap_end functions.  The
> >     first iomap will be filled in with the destination for the write (as
> >     all implementations do now), and the second iomap can be filled in
> >     with the source information for a COW operation.
> > 
> >   - If the ->iomap_begin implementation decides that COW is necessary for
> >     the requested operation, then it should fill out that second iomap
> >     with information about the extent that the actor must copied before
> >     returning.  The second iomap's offset and length must match the
> >     first.  If COW isn't necessary, the ->iomap_begin implementation
> 
> Hi,
> 
> I'm working on reflink & dax in XFS, here are some thoughts on this:
> 
> As mentioned above: the second iomap's offset and length must match the
> first.  I thought so at the beginning, but later found that the only
> difference between these two iomaps is @addr.  So, what about adding a
> @saddr, which means the source address of COW extent, into the struct iomap.
> The ->iomap_begin() fills @saddr if the extent is COW, and 0 if not.  Then
> handle this @saddr in each ->actor().  No more modifications in other
> functions.

Yes, I started off with that exact idea before Dave recommended this approach.
I used two fields instead of one, namely cow_pos and cow_addr, which defined
the source details. I had made it an iomap flag as opposed to a type,
which of course was not well received.

We may want to use iomaps for cases where two inodes are involved.
An example of the other scenario where offset may be different is file
comparison for dedup: vfs_dedup_file_range_compare(). However, it would
need two inodes in iomap as well.

> 
> My RFC patchset[1] is implemented in this way and works for me, though it is
> far away from perfectness.
> 
> [1]: https://patchwork.kernel.org/cover/10904307/
>
Ruan Shiyang May 27, 2019, 8:25 a.m. UTC | #6
On 5/23/19 7:51 PM, Goldwyn Rodrigues wrote:
>>
>> Hi,
>>
>> I'm working on reflink & dax in XFS, here are some thoughts on this:
>>
>> As mentioned above: the second iomap's offset and length must match the
>> first.  I thought so at the beginning, but later found that the only
>> difference between these two iomaps is @addr.  So, what about adding a
>> @saddr, which means the source address of COW extent, into the struct iomap.
>> The ->iomap_begin() fills @saddr if the extent is COW, and 0 if not.  Then
>> handle this @saddr in each ->actor().  No more modifications in other
>> functions.
> 
> Yes, I started of with the exact idea before being recommended this by Dave.
> I used two fields instead of one namely cow_pos and cow_addr which defined
> the source details. I had put it as a iomap flag as opposed to a type
> which of course did not appeal well.
> 
> We may want to use iomaps for cases where two inodes are involved.
> An example of the other scenario where offset may be different is file
> comparison for dedup: vfs_dedup_file_range_compare(). However, it would
> need two inodes in iomap as well.
> 
Yes, it is reasonable.  Thanks for your explanation.

One more thing RFC:
I'd like to add an end-io callback argument in ->dax_iomap_actor() to 
update the metadata after one whole COW operation is completed.  The 
end-io can also be called in ->iomap_end().  But one COW operation may 
call ->iomap_apply() many times, and so does the end-io.  Thus, I think 
it would be nice to move it to the bottom of ->dax_iomap_actor(), called 
just once in each COW operation.
Jan Kara May 28, 2019, 9:17 a.m. UTC | #7
On Mon 27-05-19 16:25:41, Shiyang Ruan wrote:
> On 5/23/19 7:51 PM, Goldwyn Rodrigues wrote:
> > > 
> > > Hi,
> > > 
> > > I'm working on reflink & dax in XFS, here are some thoughts on this:
> > > 
> > > As mentioned above: the second iomap's offset and length must match the
> > > first.  I thought so at the beginning, but later found that the only
> > > difference between these two iomaps is @addr.  So, what about adding a
> > > @saddr, which means the source address of COW extent, into the struct iomap.
> > > The ->iomap_begin() fills @saddr if the extent is COW, and 0 if not.  Then
> > > handle this @saddr in each ->actor().  No more modifications in other
> > > functions.
> > 
> > Yes, I started of with the exact idea before being recommended this by Dave.
> > I used two fields instead of one namely cow_pos and cow_addr which defined
> > the source details. I had put it as a iomap flag as opposed to a type
> > which of course did not appeal well.
> > 
> > We may want to use iomaps for cases where two inodes are involved.
> > An example of the other scenario where offset may be different is file
> > comparison for dedup: vfs_dedup_file_range_compare(). However, it would
> > need two inodes in iomap as well.
> > 
> Yes, it is reasonable.  Thanks for your explanation.
> 
> One more thing RFC:
> I'd like to add an end-io callback argument in ->dax_iomap_actor() to update
> the metadata after one whole COW operation is completed.  The end-io can
> also be called in ->iomap_end().  But one COW operation may call
> ->iomap_apply() many times, and so does the end-io.  Thus, I think it would
> be nice to move it to the bottom of ->dax_iomap_actor(), called just once in
> each COW operation.

I'm sorry but I don't follow what you suggest. One COW operation is a call
to dax_iomap_rw(), isn't it? That may call iomap_apply() several times,
each invocation calls ->iomap_begin(), ->actor() (dax_iomap_actor()),
->iomap_end() once. So I don't see a difference between doing something in
->actor() and ->iomap_end() (besides the passed arguments but that does not
seem to be your concern). So what do you exactly want to do?

								Honza
Ruan Shiyang May 29, 2019, 2:01 a.m. UTC | #8
On 5/28/19 5:17 PM, Jan Kara wrote:
> On Mon 27-05-19 16:25:41, Shiyang Ruan wrote:
>> On 5/23/19 7:51 PM, Goldwyn Rodrigues wrote:
>>>>
>>>> Hi,
>>>>
>>>> I'm working on reflink & dax in XFS, here are some thoughts on this:
>>>>
>>>> As mentioned above: the second iomap's offset and length must match the
>>>> first.  I thought so at the beginning, but later found that the only
>>>> difference between these two iomaps is @addr.  So, what about adding a
>>>> @saddr, which means the source address of COW extent, into the struct iomap.
>>>> The ->iomap_begin() fills @saddr if the extent is COW, and 0 if not.  Then
>>>> handle this @saddr in each ->actor().  No more modifications in other
>>>> functions.
>>>
>>> Yes, I started of with the exact idea before being recommended this by Dave.
>>> I used two fields instead of one namely cow_pos and cow_addr which defined
>>> the source details. I had put it as a iomap flag as opposed to a type
>>> which of course did not appeal well.
>>>
>>> We may want to use iomaps for cases where two inodes are involved.
>>> An example of the other scenario where offset may be different is file
>>> comparison for dedup: vfs_dedup_file_range_compare(). However, it would
>>> need two inodes in iomap as well.
>>>
>> Yes, it is reasonable.  Thanks for your explanation.
>>
>> One more thing RFC:
>> I'd like to add an end-io callback argument in ->dax_iomap_actor() to update
>> the metadata after one whole COW operation is completed.  The end-io can
>> also be called in ->iomap_end().  But one COW operation may call
>> ->iomap_apply() many times, and so does the end-io.  Thus, I think it would
>> be nice to move it to the bottom of ->dax_iomap_actor(), called just once in
>> each COW operation.
> 
> I'm sorry but I don't follow what you suggest. One COW operation is a call
> to dax_iomap_rw(), isn't it? That may call iomap_apply() several times,
> each invocation calls ->iomap_begin(), ->actor() (dax_iomap_actor()),
> ->iomap_end() once. So I don't see a difference between doing something in
> ->actor() and ->iomap_end() (besides the passed arguments but that does not
> seem to be your concern). So what do you exactly want to do?

Hi Jan,

Thanks for pointing that out, and I'm sorry for my mistake.  It's 
->dax_iomap_rw(), not ->dax_iomap_actor().

I want to call the callback function at the end of ->dax_iomap_rw().

Like this:
dax_iomap_rw(..., callback) {

     ...
     while (...) {
         iomap_apply(...);
     }

     if (callback != null) {
         callback();
     }
     return ...;
}

> 
> 								Honza
>
Dave Chinner May 29, 2019, 2:47 a.m. UTC | #9
On Wed, May 29, 2019 at 10:01:58AM +0800, Shiyang Ruan wrote:
> 
> On 5/28/19 5:17 PM, Jan Kara wrote:
> > On Mon 27-05-19 16:25:41, Shiyang Ruan wrote:
> > > On 5/23/19 7:51 PM, Goldwyn Rodrigues wrote:
> > > > > 
> > > > > Hi,
> > > > > 
> > > > > I'm working on reflink & dax in XFS, here are some thoughts on this:
> > > > > 
> > > > > As mentioned above: the second iomap's offset and length must match the
> > > > > first.  I thought so at the beginning, but later found that the only
> > > > > difference between these two iomaps is @addr.  So, what about adding a
> > > > > @saddr, which means the source address of COW extent, into the struct iomap.
> > > > > The ->iomap_begin() fills @saddr if the extent is COW, and 0 if not.  Then
> > > > > handle this @saddr in each ->actor().  No more modifications in other
> > > > > functions.
> > > > 
> > > > Yes, I started of with the exact idea before being recommended this by Dave.
> > > > I used two fields instead of one namely cow_pos and cow_addr which defined
> > > > the source details. I had put it as a iomap flag as opposed to a type
> > > > which of course did not appeal well.
> > > > 
> > > > We may want to use iomaps for cases where two inodes are involved.
> > > > An example of the other scenario where offset may be different is file
> > > > comparison for dedup: vfs_dedup_file_range_compare(). However, it would
> > > > need two inodes in iomap as well.
> > > > 
> > > Yes, it is reasonable.  Thanks for your explanation.
> > > 
> > > One more thing RFC:
> > > I'd like to add an end-io callback argument in ->dax_iomap_actor() to update
> > > the metadata after one whole COW operation is completed.  The end-io can
> > > also be called in ->iomap_end().  But one COW operation may call
> > > ->iomap_apply() many times, and so does the end-io.  Thus, I think it would
> > > be nice to move it to the bottom of ->dax_iomap_actor(), called just once in
> > > each COW operation.
> > 
> > I'm sorry but I don't follow what you suggest. One COW operation is a call
> > to dax_iomap_rw(), isn't it? That may call iomap_apply() several times,
> > each invocation calls ->iomap_begin(), ->actor() (dax_iomap_actor()),
> > ->iomap_end() once. So I don't see a difference between doing something in
> > ->actor() and ->iomap_end() (besides the passed arguments but that does not
> > seem to be your concern). So what do you exactly want to do?
> 
> Hi Jan,
> 
> Thanks for pointing out, and I'm sorry for my mistake.  It's
> ->dax_iomap_rw(), not ->dax_iomap_actor().
> 
> I want to call the callback function at the end of ->dax_iomap_rw().
> 
> Like this:
> dax_iomap_rw(..., callback) {
> 
>     ...
>     while (...) {
>         iomap_apply(...);
>     }
> 
>     if (callback != null) {
>         callback();
>     }
>     return ...;
> }

Why does this need to be in dax_iomap_rw()?

We already do post-dax_iomap_rw() "io-end callbacks" directly in
xfs_file_dax_write() to update the file size....

Cheers,

Dave.
Ruan Shiyang May 29, 2019, 4:02 a.m. UTC | #10
On 5/29/19 10:47 AM, Dave Chinner wrote:
> On Wed, May 29, 2019 at 10:01:58AM +0800, Shiyang Ruan wrote:
>>
>> On 5/28/19 5:17 PM, Jan Kara wrote:
>>> On Mon 27-05-19 16:25:41, Shiyang Ruan wrote:
>>>> On 5/23/19 7:51 PM, Goldwyn Rodrigues wrote:
>>>>>>
>>>>>> Hi,
>>>>>>
>>>>>> I'm working on reflink & dax in XFS, here are some thoughts on this:
>>>>>>
>>>>>> As mentioned above: the second iomap's offset and length must match the
>>>>>> first.  I thought so at the beginning, but later found that the only
>>>>>> difference between these two iomaps is @addr.  So, what about adding a
>>>>>> @saddr, which means the source address of COW extent, into the struct iomap.
>>>>>> The ->iomap_begin() fills @saddr if the extent is COW, and 0 if not.  Then
>>>>>> handle this @saddr in each ->actor().  No more modifications in other
>>>>>> functions.
>>>>>
>>>>> Yes, I started of with the exact idea before being recommended this by Dave.
>>>>> I used two fields instead of one namely cow_pos and cow_addr which defined
>>>>> the source details. I had put it as a iomap flag as opposed to a type
>>>>> which of course did not appeal well.
>>>>>
>>>>> We may want to use iomaps for cases where two inodes are involved.
>>>>> An example of the other scenario where offset may be different is file
>>>>> comparison for dedup: vfs_dedup_file_range_compare(). However, it would
>>>>> need two inodes in iomap as well.
>>>>>
>>>> Yes, it is reasonable.  Thanks for your explanation.
>>>>
>>>> One more thing RFC:
>>>> I'd like to add an end-io callback argument in ->dax_iomap_actor() to update
>>>> the metadata after one whole COW operation is completed.  The end-io can
>>>> also be called in ->iomap_end().  But one COW operation may call
>>>> ->iomap_apply() many times, and so does the end-io.  Thus, I think it would
>>>> be nice to move it to the bottom of ->dax_iomap_actor(), called just once in
>>>> each COW operation.
>>>
>>> I'm sorry but I don't follow what you suggest. One COW operation is a call
>>> to dax_iomap_rw(), isn't it? That may call iomap_apply() several times,
>>> each invocation calls ->iomap_begin(), ->actor() (dax_iomap_actor()),
>>> ->iomap_end() once. So I don't see a difference between doing something in
>>> ->actor() and ->iomap_end() (besides the passed arguments but that does not
>>> seem to be your concern). So what do you exactly want to do?
>>
>> Hi Jan,
>>
>> Thanks for pointing out, and I'm sorry for my mistake.  It's
>> ->dax_iomap_rw(), not ->dax_iomap_actor().
>>
>> I want to call the callback function at the end of ->dax_iomap_rw().
>>
>> Like this:
>> dax_iomap_rw(..., callback) {
>>
>>      ...
>>      while (...) {
>>          iomap_apply(...);
>>      }
>>
>>      if (callback != null) {
>>          callback();
>>      }
>>      return ...;
>> }
> 
> Why does this need to be in dax_iomap_rw()?
> 
> We already do post-dax_iomap_rw() "io-end callbacks" directly in
> xfs_file_dax_write() to update the file size....

Yes, but we also need to call ->xfs_reflink_end_cow() after a COW 
operation.  And an is-cow flag(from iomap) is also needed to determine 
if we call it.  I think it would be better to put this into 
->dax_iomap_rw() as a callback function.

So sorry for my poor expression.

> 
> Cheers,
> 
> Dave.
>
Darrick J. Wong May 29, 2019, 4:07 a.m. UTC | #11
On Wed, May 29, 2019 at 12:02:40PM +0800, Shiyang Ruan wrote:
> 
> 
> On 5/29/19 10:47 AM, Dave Chinner wrote:
> > On Wed, May 29, 2019 at 10:01:58AM +0800, Shiyang Ruan wrote:
> > > 
> > > On 5/28/19 5:17 PM, Jan Kara wrote:
> > > > On Mon 27-05-19 16:25:41, Shiyang Ruan wrote:
> > > > > On 5/23/19 7:51 PM, Goldwyn Rodrigues wrote:
> > > > > > > 
> > > > > > > Hi,
> > > > > > > 
> > > > > > > I'm working on reflink & dax in XFS, here are some thoughts on this:
> > > > > > > 
> > > > > > > As mentioned above: the second iomap's offset and length must match the
> > > > > > > first.  I thought so at the beginning, but later found that the only
> > > > > > > difference between these two iomaps is @addr.  So, what about adding a
> > > > > > > @saddr, which means the source address of COW extent, into the struct iomap.
> > > > > > > The ->iomap_begin() fills @saddr if the extent is COW, and 0 if not.  Then
> > > > > > > handle this @saddr in each ->actor().  No more modifications in other
> > > > > > > functions.
> > > > > > 
> > > > > > Yes, I started of with the exact idea before being recommended this by Dave.
> > > > > > I used two fields instead of one namely cow_pos and cow_addr which defined
> > > > > > the source details. I had put it as a iomap flag as opposed to a type
> > > > > > which of course did not appeal well.
> > > > > > 
> > > > > > We may want to use iomaps for cases where two inodes are involved.
> > > > > > An example of the other scenario where offset may be different is file
> > > > > > comparison for dedup: vfs_dedup_file_range_compare(). However, it would
> > > > > > need two inodes in iomap as well.
> > > > > > 
> > > > > Yes, it is reasonable.  Thanks for your explanation.
> > > > > 
> > > > > One more thing RFC:
> > > > > I'd like to add an end-io callback argument in ->dax_iomap_actor() to update
> > > > > the metadata after one whole COW operation is completed.  The end-io can
> > > > > also be called in ->iomap_end().  But one COW operation may call
> > > > > ->iomap_apply() many times, and so does the end-io.  Thus, I think it would
> > > > > be nice to move it to the bottom of ->dax_iomap_actor(), called just once in
> > > > > each COW operation.
> > > > 
> > > > I'm sorry but I don't follow what you suggest. One COW operation is a call
> > > > to dax_iomap_rw(), isn't it? That may call iomap_apply() several times,
> > > > each invocation calls ->iomap_begin(), ->actor() (dax_iomap_actor()),
> > > > ->iomap_end() once. So I don't see a difference between doing something in
> > > > ->actor() and ->iomap_end() (besides the passed arguments but that does not
> > > > seem to be your concern). So what do you exactly want to do?
> > > 
> > > Hi Jan,
> > > 
> > > Thanks for pointing out, and I'm sorry for my mistake.  It's
> > > ->dax_iomap_rw(), not ->dax_iomap_actor().
> > > 
> > > I want to call the callback function at the end of ->dax_iomap_rw().
> > > 
> > > Like this:
> > > dax_iomap_rw(..., callback) {
> > > 
> > >      ...
> > >      while (...) {
> > >          iomap_apply(...);
> > >      }
> > > 
> > >      if (callback != null) {
> > >          callback();
> > >      }
> > >      return ...;
> > > }
> > 
> > Why does this need to be in dax_iomap_rw()?
> > 
> > We already do post-dax_iomap_rw() "io-end callbacks" directly in
> > xfs_file_dax_write() to update the file size....
> 
> Yes, but we also need to call ->xfs_reflink_end_cow() after a COW operation.
> And an is-cow flag(from iomap) is also needed to determine if we call it.  I
> think it would be better to put this into ->dax_iomap_rw() as a callback
> function.

Sort of like how iomap_dio_rw takes a write endio function?

--D

> So sorry for my poor expression.
> 
> > 
> > Cheers,
> > 
> > Dave.
> > 
> 
> -- 
> Thanks,
> Shiyang Ruan.
> 
>
Dave Chinner May 29, 2019, 4:46 a.m. UTC | #12
On Tue, May 28, 2019 at 09:07:19PM -0700, Darrick J. Wong wrote:
> On Wed, May 29, 2019 at 12:02:40PM +0800, Shiyang Ruan wrote:
> > On 5/29/19 10:47 AM, Dave Chinner wrote:
> > > On Wed, May 29, 2019 at 10:01:58AM +0800, Shiyang Ruan wrote:
> > > > On 5/28/19 5:17 PM, Jan Kara wrote:
> > > > > I'm sorry but I don't follow what you suggest. One COW operation is a call
> > > > > to dax_iomap_rw(), isn't it? That may call iomap_apply() several times,
> > > > > each invocation calls ->iomap_begin(), ->actor() (dax_iomap_actor()),
> > > > > ->iomap_end() once. So I don't see a difference between doing something in
> > > > > ->actor() and ->iomap_end() (besides the passed arguments but that does not
> > > > > seem to be your concern). So what do you exactly want to do?
> > > > 
> > > > Hi Jan,
> > > > 
> > > > Thanks for pointing out, and I'm sorry for my mistake.  It's
> > > > ->dax_iomap_rw(), not ->dax_iomap_actor().
> > > > 
> > > > I want to call the callback function at the end of ->dax_iomap_rw().
> > > > 
> > > > Like this:
> > > > dax_iomap_rw(..., callback) {
> > > > 
> > > >      ...
> > > >      while (...) {
> > > >          iomap_apply(...);
> > > >      }
> > > > 
> > > >      if (callback != null) {
> > > >          callback();
> > > >      }
> > > >      return ...;
> > > > }
> > > 
> > > Why does this need to be in dax_iomap_rw()?
> > > 
> > > We already do post-dax_iomap_rw() "io-end callbacks" directly in
> > > xfs_file_dax_write() to update the file size....
> > 
> > Yes, but we also need to call ->xfs_reflink_end_cow() after a COW operation.
> > And an is-cow flag(from iomap) is also needed to determine if we call it.  I
> > think it would be better to put this into ->dax_iomap_rw() as a callback
> > function.
> 
> Sort of like how iomap_dio_rw takes a write endio function?

You mean like we originally had in the DAX code for unwritten
extents?

But we got rid of that because performance of unwritten extents was
absolutely woeful - it's cheaper in terms of CPU cost to do up front
zeroing (i.e. inside ->iomap_begin) than it is to use unwritten
extents and convert them to protect against stale data exposure.

I have a feeling that exactly the same thing is true for CoW - the
hoops we jump through to do COW fork manipulation and then extent
movement between the COW fork and the data fork on IO completion
would be better done before we commit the COW extent allocation.

In which case, what we actually want for DAX is:


 iomap_apply()

 	->iomap_begin()
		map old data extent that we copy from

		allocate new data extent we copy to in data fork,
		immediately replacing old data extent

		return transaction handle as private data

	dax_iomap_actor()
		copies data from old extent to new extent

	->iomap_end
		commits transaction now data has been copied, making
		the COW operation atomic with the data copy.


This, in fact, should be how we do all DAX writes that require
allocation, because then we get rid of the need to zero newly
allocated or unwritten extents before we copy the data into it. i.e.
we only need to write once to newly allocated storage rather than
twice.

This gets rid of the need for COW callbacks, and means the DAX
reflink implementation does not need to use delalloc
speculative preallocation or COW forks at all.

Cheers,

Dave.
Jan Kara May 29, 2019, 1:46 p.m. UTC | #13
On Wed 29-05-19 14:46:58, Dave Chinner wrote:
> On Tue, May 28, 2019 at 09:07:19PM -0700, Darrick J. Wong wrote:
> > On Wed, May 29, 2019 at 12:02:40PM +0800, Shiyang Ruan wrote:
> > > On 5/29/19 10:47 AM, Dave Chinner wrote:
> > > > On Wed, May 29, 2019 at 10:01:58AM +0800, Shiyang Ruan wrote:
> > > > > On 5/28/19 5:17 PM, Jan Kara wrote:
> > > > > > I'm sorry but I don't follow what you suggest. One COW operation is a call
> > > > > > to dax_iomap_rw(), isn't it? That may call iomap_apply() several times,
> > > > > > each invocation calls ->iomap_begin(), ->actor() (dax_iomap_actor()),
> > > > > > ->iomap_end() once. So I don't see a difference between doing something in
> > > > > > ->actor() and ->iomap_end() (besides the passed arguments but that does not
> > > > > > seem to be your concern). So what do you exactly want to do?
> > > > > 
> > > > > Hi Jan,
> > > > > 
> > > > > Thanks for pointing out, and I'm sorry for my mistake.  It's
> > > > > ->dax_iomap_rw(), not ->dax_iomap_actor().
> > > > > 
> > > > > I want to call the callback function at the end of ->dax_iomap_rw().
> > > > > 
> > > > > Like this:
> > > > > dax_iomap_rw(..., callback) {
> > > > > 
> > > > >      ...
> > > > >      while (...) {
> > > > >          iomap_apply(...);
> > > > >      }
> > > > > 
> > > > >      if (callback != null) {
> > > > >          callback();
> > > > >      }
> > > > >      return ...;
> > > > > }
> > > > 
> > > > Why does this need to be in dax_iomap_rw()?
> > > > 
> > > > We already do post-dax_iomap_rw() "io-end callbacks" directly in
> > > > xfs_file_dax_write() to update the file size....
> > > 
> > > Yes, but we also need to call ->xfs_reflink_end_cow() after a COW operation.
> > > And an is-cow flag(from iomap) is also needed to determine if we call it.  I
> > > think it would be better to put this into ->dax_iomap_rw() as a callback
> > > function.
> > 
> > Sort of like how iomap_dio_rw takes a write endio function?
> 
> You mean like we originally had in the DAX code for unwritten
> extents?
> 
> But we got rid of that because performance of unwritten extents was
> absolutely woeful - it's cheaper in terms of CPU cost to do up front
> zeroing (i.e. inside ->iomap_begin) than it is to use unwritten
> extents and convert them to protect against stale data exposure.
> 
> I have a feeling that exactly the same thing is true for CoW - the
> hoops we jump through to do COW fork manipulation and then extent
> movement between the COW fork and the data fork on IO completion
> would be better done before we commit the COW extent allocation.
> 
> In which case, what we actually want for DAX is:
> 
> 
>  iomap_apply()
> 
>  	->iomap_begin()
> 		map old data extent that we copy from
> 
> 		allocate new data extent we copy to in data fork,
> 		immediately replacing old data extent
> 
> 		return transaction handle as private data
> 
> 	dax_iomap_actor()
> 		copies data from old extent to new extent
> 
> 	->iomap_end
> 		commits transaction now data has been copied, making
> 		the COW operation atomic with the data copy.
> 
> 
> This, in fact, should be how we do all DAX writes that require
> allocation, because then we get rid of the need to zero newly
> allocated or unwritten extents before we copy the data into it. i.e.
> we only need to write once to newly allocated storage rather than
> twice.

You need to be careful though. You need to synchronize with page faults so
that they cannot see and expose in page tables blocks you've allocated
before their contents are filled. This race was actually the strongest
motivation for pre-zeroing of blocks. OTOH copy_from_iter() in
dax_iomap_actor() needs to be able to fault pages to copy from (and these
pages may be from the same file you're writing to) so you cannot just block
faulting for the file through I_MMAP_LOCK.

								Honza
Dave Chinner May 29, 2019, 10:14 p.m. UTC | #14
On Wed, May 29, 2019 at 03:46:29PM +0200, Jan Kara wrote:
> On Wed 29-05-19 14:46:58, Dave Chinner wrote:
> >  iomap_apply()
> > 
> >  	->iomap_begin()
> > 		map old data extent that we copy from
> > 
> > 		allocate new data extent we copy to in data fork,
> > 		immediately replacing old data extent
> > 
> > 		return transaction handle as private data

This holds the inode block map locked exclusively across the IO,
so....

> > 
> > 	dax_iomap_actor()
> > 		copies data from old extent to new extent
> > 
> > 	->iomap_end
> > 		commits transaction now data has been copied, making
> > 		the COW operation atomic with the data copy.
> > 
> > 
> > This, in fact, should be how we do all DAX writes that require
> > allocation, because then we get rid of the need to zero newly
> > allocated or unwritten extents before we copy the data into it. i.e.
> > we only need to write once to newly allocated storage rather than
> > twice.
> 
> You need to be careful though. You need to synchronize with page faults so
> that they cannot see and expose in page tables blocks you've allocated
> before their contents is filled.

... so the page fault will block trying to map the blocks because
it can't get the xfs_inode->i_ilock until the allocation transaction
commits....

> This race was actually the strongest
> motivation for pre-zeroing of blocks. OTOH copy_from_iter() in
> dax_iomap_actor() needs to be able to fault pages to copy from (and these
> pages may be from the same file you're writing to) so you cannot just block
> faulting for the file through I_MMAP_LOCK.

Right, it doesn't take the I_MMAP_LOCK, but it would block further
in. And, really, I'm not caring all this much about this corner
case. i.e.  anyone using a "mmap()+write() zero copy" pattern on DAX
within a file is unbelievably naive - the data still gets copied by
the CPU in the write() call. It's far simpler and more efficient to
just mmap() both ranges of the file(s) and memcpy() in userspace....

FWIW, it's to avoid problems with stupid userspace stuff that nobody
really should be doing that I want range locks for the XFS inode
locks.  If userspace overlaps the ranges and deadlocks in that case,
then they get to keep all the broken bits because, IMO, they are
doing something monumentally stupid. I'd probably be making it
return EDEADLOCK back out to userspace in that case rather than
deadlocking but, fundamentally, I think it's broken behaviour that
we should be rejecting with an error rather than adding complexity
trying to handle it.

Cheers,

Dave.
Jan Kara May 30, 2019, 11:16 a.m. UTC | #15
On Thu 30-05-19 08:14:45, Dave Chinner wrote:
> On Wed, May 29, 2019 at 03:46:29PM +0200, Jan Kara wrote:
> > On Wed 29-05-19 14:46:58, Dave Chinner wrote:
> > >  iomap_apply()
> > > 
> > >  	->iomap_begin()
> > > 		map old data extent that we copy from
> > > 
> > > 		allocate new data extent we copy to in data fork,
> > > 		immediately replacing old data extent
> > > 
> > > 		return transaction handle as private data
> 
> This holds the inode block map locked exclusively across the IO,
> so....

Does it? We do hold XFS_IOLOCK_EXCL during the whole dax write. But
xfs_file_iomap_begin() does release XFS_ILOCK_* on exit AFAICS. So I don't
see anything that would prevent page fault from mapping blocks into page
tables just after xfs_file_iomap_begin() returns.

> > > 	dax_iomap_actor()
> > > 		copies data from old extent to new extent
> > > 
> > > 	->iomap_end
> > > 		commits transaction now data has been copied, making
> > > 		the COW operation atomic with the data copy.
> > > 
> > > 
> > > This, in fact, should be how we do all DAX writes that require
> > > allocation, because then we get rid of the need to zero newly
> > > allocated or unwritten extents before we copy the data into it. i.e.
> > > we only need to write once to newly allocated storage rather than
> > > twice.
> > 
> > You need to be careful though. You need to synchronize with page faults so
> > that they cannot see and expose in page tables blocks you've allocated
> > before their contents is filled.
> 
> ... so the page fault will block trying to map the blocks because
> it can't get the xfs_inode->i_ilock until the allocation transaciton
> commits....
> 
> > This race was actually the strongest
> > motivation for pre-zeroing of blocks. OTOH copy_from_iter() in
> > dax_iomap_actor() needs to be able to fault pages to copy from (and these
> > pages may be from the same file you're writing to) so you cannot just block
> > faulting for the file through I_MMAP_LOCK.
> 
> Right, it doesn't take the I_MMAP_LOCK, but it would block further
> in. And, really, I'm not caring all this much about this corner
> case. i.e.  anyone using a "mmap()+write() zero copy" pattern on DAX
> within a file is unbeleivably naive - the data still gets copied by
> the CPU in the write() call. It's far simpler and more effcient to
> just mmap() both ranges of the file(s) and memcpy() in userspace....
> 
> FWIW, it's to avoid problems with stupid userspace stuff that nobody
> really should be doing that I want range locks for the XFS inode
> locks.  If userspace overlaps the ranges and deadlocks in that case,
> they they get to keep all the broken bits because, IMO, they are
> doing something monumentally stupid. I'd probably be making it
> return EDEADLOCK back out to userspace in the case rather than
> deadlocking but, fundamentally, I think it's broken behaviour that
> we should be rejecting with an error rather than adding complexity
> trying to handle it.

I agree with this. We must just prevent user from taking the kernel down
with maliciously created IOs...

								Honza
Dave Chinner May 30, 2019, 10:59 p.m. UTC | #16
On Thu, May 30, 2019 at 01:16:05PM +0200, Jan Kara wrote:
> On Thu 30-05-19 08:14:45, Dave Chinner wrote:
> > On Wed, May 29, 2019 at 03:46:29PM +0200, Jan Kara wrote:
> > > On Wed 29-05-19 14:46:58, Dave Chinner wrote:
> > > >  iomap_apply()
> > > > 
> > > >  	->iomap_begin()
> > > > 		map old data extent that we copy from
> > > > 
> > > > 		allocate new data extent we copy to in data fork,
> > > > 		immediately replacing old data extent
> > > > 
> > > > 		return transaction handle as private data
> > 
> > This holds the inode block map locked exclusively across the IO,
> > so....
> 
> Does it? We do hold XFS_IOLOCK_EXCL during the whole dax write.

I forgot about that, I keep thinking that we use shared locking for
DAX like we do for direct IO. There's another reason for range
locks - allowing concurrent DAX read/write IO - but that's
orthogonal to the issue here.

> But
> xfs_file_iomap_begin() does release XFS_ILOCK_* on exit AFAICS. So I don't
> see anything that would prevent page fault from mapping blocks into page
> tables just after xfs_file_iomap_begin() returns.

Right, holding the IOLOCK doesn't stop concurrent page faults from
mapping the page we are trying to write, and that leaves a window
where stale data can be exposed if we don't initialise the newly
allocated range whilst in the allocation transaction holding the
ILOCK. That's what the XFS_BMAPI_ZERO flag does in the DAX block
allocation path.

So the idea of holding the allocation transaction across the data
copy means that ILOCK is then held until the data blocks are fully
initialised with valid data, meaning we can greatly reduce the scope
of the XFS_BMAPI_ZERO flag and possibly get rid of it altogether.

> > > This race was actually the strongest
> > > motivation for pre-zeroing of blocks. OTOH copy_from_iter() in
> > > dax_iomap_actor() needs to be able to fault pages to copy from (and these
> > > pages may be from the same file you're writing to) so you cannot just block
> > > faulting for the file through I_MMAP_LOCK.
> > 
> > Right, it doesn't take the I_MMAP_LOCK, but it would block further
> > in. And, really, I'm not caring all this much about this corner
> > case. i.e.  anyone using a "mmap()+write() zero copy" pattern on DAX
> > within a file is unbeleivably naive - the data still gets copied by
> > the CPU in the write() call. It's far simpler and more effcient to
> > just mmap() both ranges of the file(s) and memcpy() in userspace....
> > 
> > FWIW, it's to avoid problems with stupid userspace stuff that nobody
> > really should be doing that I want range locks for the XFS inode
> > locks.  If userspace overlaps the ranges and deadlocks in that case,
> > they they get to keep all the broken bits because, IMO, they are
> > doing something monumentally stupid. I'd probably be making it
> > return EDEADLOCK back out to userspace in the case rather than
> > deadlocking but, fundamentally, I think it's broken behaviour that
> > we should be rejecting with an error rather than adding complexity
> > trying to handle it.
> 
> I agree with this. We must just prevent user from taking the kernel down
> with maliciously created IOs...

Noted. :)

I'm still working to scale the range locks effectively for direct
IO; I've got to work out why sometimes they give identical
performance to rwsems out to 16 threads, and other times they run
20% slower or worse at 8+ threads. I'm way ahead of the original
mutex protected tree implementation that I have, but still got work
to do to get consistently close to rwsem performance for pure shared
locking workloads like direct IO.

Cheers,

Dave.
diff mbox series

Patch

diff --git a/fs/dax.c b/fs/dax.c
index e5e54da1715f..610bfa861a28 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1084,6 +1084,42 @@  int __dax_zero_page_range(struct block_device *bdev,
 }
 EXPORT_SYMBOL_GPL(__dax_zero_page_range);
 
+/*
+ * dax_copy_edges - Fill the parts of the page(s) around a CoW write that
+ *		    the write does not cover because pos and/or pos + length
+ *		    are not page aligned.
+ * @daddr is treated as the address of the start of the page holding @pos.
+ * Data comes from iomap->inline_data; if that is NULL the edges are zeroed.
+ * memcpy_mcsafe() is assumed to return 0 or a count of uncopied bytes, so
+ * any nonzero result is turned into -EIO rather than returned as-is.
+ */
+static int dax_copy_edges(struct inode *inode, loff_t pos, loff_t length,
+			   struct iomap *iomap, void *daddr)
+{
+	unsigned offset = pos & (PAGE_SIZE - 1);
+	loff_t end = pos + length;
+	loff_t pg_end = round_up(end, PAGE_SIZE);
+	void *saddr = iomap->inline_data;
+
+	/* Head: page start up to pos, so offset doubles as the length. */
+	if (offset) {
+		if (!saddr)
+			memset(daddr, 0, offset);
+		else if (memcpy_mcsafe(daddr, saddr, offset))
+			return -EIO;	/* don't let the tail copy mask this */
+	}
+
+	/* Tail: end of the write up to the end of its page. */
+	if (end < pg_end) {
+		if (!saddr)
+			memset(daddr + offset + length, 0, pg_end - end);
+		else if (memcpy_mcsafe(daddr + offset + length,
+				saddr + offset + length, pg_end - end))
+			return -EIO;
+	}
+	return 0;
+}
+
 static loff_t
 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 		struct iomap *iomap)
@@ -1105,9 +1141,11 @@  dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 			return iov_iter_zero(min(length, end - pos), iter);
 	}
 
-	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED))
+	if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED
+			 && iomap->type != IOMAP_DAX_COW))
 		return -EIO;
 
+
 	/*
 	 * Write can allocate block for an area which has a hole page mapped
 	 * into page tables. We have to tear down these mappings so that data
@@ -1144,6 +1182,12 @@  dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 			break;
 		}
 
+		if (iomap->type == IOMAP_DAX_COW) {
+			ret = dax_copy_edges(inode, pos, length, iomap, kaddr);
+			if (ret)
+				break;
+		}
+
 		map_len = PFN_PHYS(map_len);
 		kaddr += offset;
 		map_len -= offset;
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 0fefb5455bda..6e885c5a38a3 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -25,6 +25,7 @@  struct vm_fault;
 #define IOMAP_MAPPED	0x03	/* blocks allocated at @addr */
 #define IOMAP_UNWRITTEN	0x04	/* blocks allocated at @addr in unwritten state */
 #define IOMAP_INLINE	0x05	/* data inline in the inode */
+#define IOMAP_DAX_COW	0x06	/* Copy data pointed by inline_data before write*/
 
 /*
  * Flags for all iomap mappings: